Merge branch 'release-4-6'
author Teemu Murtola <teemu.murtola@gmail.com>
Fri, 24 May 2013 17:59:13 +0000 (20:59 +0300)
committer Teemu Murtola <teemu.murtola@gmail.com>
Fri, 24 May 2013 17:59:13 +0000 (20:59 +0300)
Omitted the patch with the hacks for silencing warnings with gcc-4.8,
since we plan to do a proper job of that in master branch.

Some minor conflicts resolved.

Conflicts:
CMakeLists.txt
cmake/ThreadMPI.cmake
src/config.h.cmakein
src/programs/mdrun/md.c

Change-Id: I2c1f1b9b40100c269eea6b06b7b073491b5e17d6

170 files changed:
CMakeLists.txt
cmake/TestAtomics.c
cmake/ThreadMPI.cmake
cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake [new file with mode: 0644]
cmake/Toolchain-Fujitsu-Sparc64.cmake [new file with mode: 0644]
share/html/online/mdp_opt.html
src/config.h.cmakein
src/gromacs/gmxana/gmx_genion.c
src/gromacs/gmxlib/gmx_cpuid.c
src/gromacs/gmxlib/nonbonded/CMakeLists.txt
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py [new file with mode: 0755]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_template_sparc64_hpc_ace_double.pre [new file with mode: 0644]
src/gromacs/gmxlib/nonbonded/nonbonded.c
src/gromacs/gmxlib/thread_mpi/CMakeLists.txt
src/gromacs/gmxlib/thread_mpi/atomic.c [new file with mode: 0644]
src/gromacs/gmxlib/thread_mpi/barrier.c
src/gromacs/gmxlib/thread_mpi/bcast.c
src/gromacs/gmxlib/thread_mpi/collective.c
src/gromacs/gmxlib/thread_mpi/collective.h
src/gromacs/gmxlib/thread_mpi/comm.c
src/gromacs/gmxlib/thread_mpi/errhandler.c
src/gromacs/gmxlib/thread_mpi/gather.c
src/gromacs/gmxlib/thread_mpi/impl.h
src/gromacs/gmxlib/thread_mpi/lock.c
src/gromacs/gmxlib/thread_mpi/once.c
src/gromacs/gmxlib/thread_mpi/p2p_protocol.c
src/gromacs/gmxlib/thread_mpi/p2p_send_recv.c
src/gromacs/gmxlib/thread_mpi/profile.c
src/gromacs/gmxlib/thread_mpi/profile.h
src/gromacs/gmxlib/thread_mpi/pthreads.c
src/gromacs/gmxlib/thread_mpi/scatter.c
src/gromacs/gmxlib/thread_mpi/settings.h
src/gromacs/gmxlib/thread_mpi/system_error.cpp
src/gromacs/gmxlib/thread_mpi/tmpi_init.c
src/gromacs/gmxlib/thread_mpi/tmpi_malloc.c
src/gromacs/gmxlib/thread_mpi/winthreads.c
src/gromacs/gmxlib/tpxio.c
src/gromacs/gmxpreprocess/readir.c
src/gromacs/legacyheaders/gmx_cpuid.h
src/gromacs/legacyheaders/thread_mpi/atomic.h
src/gromacs/legacyheaders/thread_mpi/atomic/cycles.h
src/gromacs/legacyheaders/thread_mpi/atomic/derived.h [new file with mode: 0644]
src/gromacs/legacyheaders/thread_mpi/atomic/fujitsu_sparc64.h [new file with mode: 0644]
src/gromacs/legacyheaders/thread_mpi/atomic/gcc.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_ia64.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_intrinsics.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_ppc.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_spinlock.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h
src/gromacs/legacyheaders/thread_mpi/atomic/msvc.h
src/gromacs/legacyheaders/thread_mpi/atomic/suncc-sparc.h
src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h
src/gromacs/legacyheaders/thread_mpi/event.h
src/gromacs/legacyheaders/thread_mpi/lock.h
src/gromacs/legacyheaders/thread_mpi/threads.h
src/gromacs/legacyheaders/thread_mpi/tmpi.h
src/gromacs/legacyheaders/types/idef.h
src/gromacs/mdlib/force.c
src/gromacs/mdlib/minimize.c
src/programs/mdrun/md.c

index 8db354047af569599ce12cf9bc8a4a265c94d662..ede6b0ee511ef8d7702c23d443caa572680b6bef 100644 (file)
@@ -145,7 +145,7 @@ if(NOT DEFINED GMX_CPU_ACCELERATION)
 endif(NOT DEFINED GMX_CPU_ACCELERATION)
 
 set(GMX_CPU_ACCELERATION "@GMX_SUGGESTED_CPU_ACCELERATION@"
-    CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX")
+    CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX, Sparc64_HPC_ACE")
 
 set(GMX_FFT_LIBRARY "fftw3" 
     CACHE STRING "FFT library choices: fftw3,mkl,fftpack[built-in]")
@@ -500,11 +500,13 @@ endif(GMX_X11)
 include(ThreadMPI)
 set(THREAD_MPI_LIB thread_mpi)
 if(GMX_THREAD_MPI)
-    tmpi_get_source_list(THREAD_MPI_SRC CXX)
+    tmpi_enable(CXX)
     set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_THREAD_MPI")
     set(GMX_MPI 1)
+    tmpi_get_source_list(THREAD_MPI_SRC)
 else(GMX_THREAD_MPI)
-    tmpi_get_source_list(THREAD_MPI_SRC CXX NOMPI)
+    tmpi_enable(CXX NOMPI)
+    tmpi_get_source_list(THREAD_MPI_SRC)
 endif(GMX_THREAD_MPI)
 
 if(GMX_GPU)
@@ -858,9 +860,10 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "IBM_QPX")
     else()
         message(FATAL_ERROR "Cannot compile IBM QPX intrinsics without the XL compiler. If you are compiling for BlueGene/Q, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=BlueGeneQ-static-XL-C' to set up the tool chain.")
     endif()
-
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "SPARC64_HPC_ACE")
+    set(GMX_CPU_ACCELERATION_SPARC64_HPC_ACE 1)
 else(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
-    MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX")
+    MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX, Sparc64_HPC_ACE")
 endif(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
 set(ACCELERATION_QUIETLY TRUE CACHE INTERNAL "")
 
@@ -912,11 +915,11 @@ if(${GMX_FFT_LIBRARY} STREQUAL "FFTW3")
 
     set(GMX_FFT_FFTW3 1)
 
-    if (NOT ${GMX_CPU_ACCELERATION} STREQUAL "NONE" AND NOT ${FFTW}_HAVE_SIMD) 
+    if ((${GMX_CPU_ACCELERATION} MATCHES "SSE" OR ${GMX_CPU_ACCELERATION} MATCHES "AVX") AND NOT ${FFTW}_HAVE_SIMD)
       message(WARNING "The fftw library found is compiled without SIMD support, which makes it slow. Consider recompiling it or contact your admin")
     endif()
 
-    if(NOT ${GMX_CPU_ACCELERATION} STREQUAL "NONE" AND ${FFTW}_HAVE_AVX)
+    if((${GMX_CPU_ACCELERATION} MATCHES "SSE" OR ${GMX_CPU_ACCELERATION} MATCHES "AVX") AND ${FFTW}_HAVE_AVX)
         # If we're not doing CPU acceleration, we don't care about FFTW performance on x86 either
         message(WARNING "The FFTW library was compiled with --enable-avx to enable AVX SIMD instructions. That might sound like a good idea for your processor, but for FFTW versions up to 3.3.3, these are slower than the SSE/SSE2 SIMD instructions for the way GROMACS uses FFTs. Limitations in the way FFTW allows GROMACS to measure performance make it awkward for either GROMACS or FFTW to make the decision for you based on runtime performance. You should compile a different FFTW library with --enable-sse or --enable-sse2. If you have a more recent FFTW, you may like to compare the performance of GROMACS with FFTW libraries compiled with and without --enable-avx. However, the GROMACS developers do not really expect the FFTW AVX optimization to help, because the performance is limited by memory access, not computation.")
     endif()
index 68f0343bded4e313cdd28128fcf26fac77cc8306..ebc7f6b002dd4082cdb543bf863034d0336ee722 100644 (file)
@@ -6,5 +6,3 @@ int main(void)
 {
     return 0;
 }
-
-
index fc9183afc2f1a78240caa8b1fbe34496b9cee505..bcd280ae08d48b7d9d2e31d4a1c8c9a740d73231 100644 (file)
@@ -1,31 +1,53 @@
 
 include(CheckIncludeFiles)
 include(CheckFunctionExists)
-#include(CheckCSourceCompiles)
-
-#option(THREAD_PTHREADS "Use posix threads" ON)
+include(CheckCSourceCompiles)
 
-MACRO(TEST_TMPI_ATOMICS VARIABLE)
+# sets TMPI_ATOMICS to 1 if atomic operations are found, 0 otherwise
+MACRO(TMPI_TEST_ATOMICS)
     if (NOT DEFINED TMPI_ATOMICS)
         try_compile(TEST_ATOMICS "${CMAKE_BINARY_DIR}"
                 "${CMAKE_SOURCE_DIR}/cmake/TestAtomics.c"
                 COMPILE_DEFINITIONS "-I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders" )
 
         if (TEST_ATOMICS)
-            message(STATUS "Atomics found")
-            set(${VARIABLE} TRUE CACHE INTERNAL "Whether atomic operations for thread-MPI were found")
+            message(STATUS "Atomic operations found")
         else (TEST_ATOMICS)
-            message(WARNING "Atomic operations not found for this CPU+compiler combination. Thread support will be unbearably slow: disable threads. Atomic operations should work on all but the most obscure CPU+compiler combinations; if your system is not obscure -- like, for example, x86 with gcc --  please contact the developers.")
-            set(${VARIABLE} FALSE CACHE INTERNAL "Whether atomic operations for thread-MPI were found")
+            message(STATUS "Atomic operations not found")
         endif(TEST_ATOMICS)
+        set(TMPI_ATOMICS ${TEST_ATOMICS} CACHE INTERNAL "Whether atomic operations are found")
     endif(NOT DEFINED TMPI_ATOMICS)
-ENDMACRO(TEST_TMPI_ATOMICS VARIABLE)
+ENDMACRO(TMPI_TEST_ATOMICS VARIABLE)
 
-MACRO(TMPI_MAKE_CXX_LIB)
-    set(TMPI_CXX_LIB 1)
-ENDMACRO(TMPI_MAKE_CXX_LIB)
+TMPI_TEST_ATOMICS()
 
-MACRO(TMPI_GET_SOURCE_LIST SRC_VARIABLE)
+include(FindThreads)
+if (CMAKE_USE_PTHREADS_INIT)
+    check_include_files(pthread.h    HAVE_PTHREAD_H)
+    set(THREAD_PTHREADS 1)
+    set(THREAD_LIB ${CMAKE_THREAD_LIBS_INIT})
+elseif (CMAKE_USE_WIN32_THREADS_INIT)
+    set(THREAD_WINDOWS 1)
+    set(THREAD_LIB)
+else ()
+    message(FATAL_ERROR "Thread support required")
+endif (CMAKE_USE_PTHREADS_INIT)
+
+# Turns on thread_mpi.
+# options are:
+# CXX: enable C++ library build.
+MACRO(TMPI_ENABLE)
+    # first check whether threads and atomics are available.
+    if(NOT TMPI_ATOMICS)
+        # check again, to allow the user to fix this.
+        unset(TMPI_ATOMICS CACHE)
+        TMPI_TEST_ATOMICS()
+    endif(NOT TMPI_ATOMICS)
+    if(NOT TMPI_ATOMICS)
+        message(WARNING "Atomic operations not found for this CPU+compiler combination. Thread support will be unbearably slow: disable threads. Atomic operations should work on all but the most obscure CPU+compiler combinations; if your system is not obscure -- like, for example, x86 with gcc --  please contact the developers.")
+    endif(NOT TMPI_ATOMICS)
+
+    set(TMPI_ENABLED 1)
     foreach (_option IN ITEMS ${ARGN})
         if (_option STREQUAL "CXX")
             set(TMPI_CXX_LIB 1)
@@ -35,9 +57,87 @@ MACRO(TMPI_GET_SOURCE_LIST SRC_VARIABLE)
             message(FATAL_ERROR "Unknown thread_mpi option '${_option}'")
         endif ()
     endforeach ()
+
+    #tmpi_test_atomics(TMPI_ATOMICS)
+
+# the spin-waiting option
+    option(THREAD_MPI_WAIT_FOR_NO_ONE "Use busy waits without yielding to the OS scheduler. Turning this on might improve performance (very) slightly at the cost of very poor performance if the threads are competing for CPU time." OFF)
+    mark_as_advanced(THREAD_MPI_WAIT_FOR_NO_ONE)
+    if (THREAD_MPI_WAIT_FOR_NO_ONE)
+        set(TMPI_WAIT_FOR_NO_ONE 1)
+    else (THREAD_MPI_WAIT_FOR_NO_ONE)
+        set(TMPI_WAIT_FOR_NO_ONE 0)
+    endif (THREAD_MPI_WAIT_FOR_NO_ONE)
+
+# the copy buffer option
+    option(THREAD_MPI_COPY_BUFFER "Use an intermediate copy buffer for small message sizes, to allow blocking sends to return quickly. Only useful in programs with relatively uncoupled threads (infrequent MPI communication)" OFF)
+    mark_as_advanced(THREAD_MPI_COPY_BUFFER)
+    if (THREAD_MPI_COPY_BUFFER)
+        set(TMPI_COPY_BUFFER 1)
+    else (THREAD_MPI_COPY_BUFFER)
+        set(TMPI_COPY_BUFFER 0)
+    endif (THREAD_MPI_COPY_BUFFER)
+
+# the profiling option
+    option(THREAD_MPI_PROFILING "Turn on simple MPI profiling." OFF)
+    mark_as_advanced(THREAD_MPI_PROFILING)
+    if (THREAD_MPI_PROFILING)
+        set(TMPI_PROFILE 1)
+    else (THREAD_MPI_PROFILING)
+        set(TMPI_PROFILE 0)
+    endif (THREAD_MPI_PROFILING)
+
+# tmpi warnings for testing
+    option(THREAD_MPI_WARNINGS "Turn thread_mpi warnings for testing." OFF)
+    mark_as_advanced(THREAD_MPI_WARNINGS)
+    if (THREAD_MPI_WARNINGS)
+        set(TMPI_WARNINGS 1)
+    else (THREAD_MPI_WARNINGS)
+        set(TMPI_WARNINGS 0)
+    endif (THREAD_MPI_WARNINGS)
+
+    include(CheckCSourceCompiles)
+
+# affinity checks
+    include(CheckFunctionExists)
+    if (THREAD_PTHREADS)
+        set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+        # check for sched_setaffinity
+        check_c_source_compiles(
+            "#define _GNU_SOURCE
+#include <pthread.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+    int main(void) { cpu_set_t set;
+        CPU_ZERO(&set);
+        CPU_SET(0, &set);
+        pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
+        return 0;
+    }"
+            PTHREAD_SETAFFINITY
+        )
+        if (PTHREAD_SETAFFINITY)
+            set(HAVE_PTHREAD_SETAFFINITY 1)
+        endif (PTHREAD_SETAFFINITY)
+        set(CMAKE_REQUIRED_LIBRARIES)
+    endif (THREAD_PTHREADS)
+
+
+# this runs on POSIX systems
+    check_include_files(unistd.h        HAVE_UNISTD_H)
+    check_include_files(sched.h         HAVE_SCHED_H)
+    check_include_files(sys/time.h      HAVE_SYS_TIME_H)
+    check_function_exists(sysconf       HAVE_SYSCONF)
+# this runs on windows
+#check_include_files(windows.h         HAVE_WINDOWS_H)
+ENDMACRO(TMPI_ENABLE)
+
+MACRO(TMPI_GET_SOURCE_LIST SRC_VARIABLE)
     set(${SRC_VARIABLE}
         thread_mpi/errhandler.c
-        thread_mpi/tmpi_malloc.c)
+        thread_mpi/tmpi_malloc.c
+        thread_mpi/atomic.c)
     if (THREAD_PTHREADS)
         list(APPEND ${SRC_VARIABLE} thread_mpi/pthreads.c)
     elseif (THREAD_WINDOWS)
@@ -46,7 +146,7 @@ MACRO(TMPI_GET_SOURCE_LIST SRC_VARIABLE)
     if (TMPI_CXX_LIB)
         list(APPEND ${SRC_VARIABLE} thread_mpi/system_error.cpp)
     endif (TMPI_CXX_LIB)
-    if (NOT TMPI_NO_MPI_LIB)
+    if (TMPI_ENABLED)
         list(APPEND ${SRC_VARIABLE}
              thread_mpi/alltoall.c      thread_mpi/p2p_protocol.c
              thread_mpi/barrier.c       thread_mpi/p2p_send_recv.c
@@ -63,92 +163,3 @@ MACRO(TMPI_GET_SOURCE_LIST SRC_VARIABLE)
     endif()
 ENDMACRO(TMPI_GET_SOURCE_LIST)
 
-test_tmpi_atomics(TMPI_ATOMICS)
-
-include(FindThreads)
-if (CMAKE_USE_PTHREADS_INIT)
-    check_include_files(pthread.h    HAVE_PTHREAD_H)
-    set(THREAD_PTHREADS 1)
-    #add_definitions(-DTHREAD_PTHREADS)
-    set(THREAD_LIB ${CMAKE_THREAD_LIBS_INIT})
-elseif (CMAKE_USE_WIN32_THREADS_INIT)
-    set(THREAD_WINDOWS 1)
-    #add_definitions(-DTHREAD_WINDOWS)
-    set(THREAD_LIB)
-else ()
-    message(FATAL_ERROR "Thread support required")
-endif (CMAKE_USE_PTHREADS_INIT)
-
-
-# the spin-waiting option
-option(THREAD_MPI_WAIT_FOR_NO_ONE "Use busy waits without yielding to the OS scheduler. Turning this on might improve performance (very) slightly at the cost of very poor performance if the threads are competing for CPU time." OFF)
-mark_as_advanced(THREAD_MPI_WAIT_FOR_NO_ONE)
-if (THREAD_MPI_WAIT_FOR_NO_ONE)
-    add_definitions(-DTMPI_WAIT_FOR_NO_ONE)
-else (THREAD_MPI_WAIT_FOR_NO_ONE)
-    add_definitions()
-endif (THREAD_MPI_WAIT_FOR_NO_ONE)
-
-
-# the copy buffer option
-option(THREAD_MPI_COPY_BUFFER "Use an intermediate copy buffer for small message sizes, to allow blocking sends to return quickly." ON)
-mark_as_advanced(THREAD_MPI_COPY_BUFFER)
-if (THREAD_MPI_COPY_BUFFER)
-    add_definitions()
-else (THREAD_MPI_COPY_BUFFER)
-    add_definitions(-DTMPI_NO_COPY_BUFFER)
-endif (THREAD_MPI_COPY_BUFFER)
-
-
-# the profiling option
-option(THREAD_MPI_PROFILING "Turn on simple MPI profiling." OFF)
-mark_as_advanced(THREAD_MPI_PROFILING)
-if (THREAD_MPI_PROFILING)
-    add_definitions(-DTMPI_PROFILE)
-else (THREAD_MPI_PROFILING)
-    add_definitions()
-endif (THREAD_MPI_PROFILING)
-
-include(CheckCSourceCompiles)
-
-# option to set affinity 
-option(THREAD_MPI_SET_AFFINITY "Set thread affinity to a core if number of threads equal to number of hardware threads." ON)
-mark_as_advanced(THREAD_MPI_SET_AFFINITY)
-if (THREAD_MPI_SET_AFFINITY)
-    add_definitions(-DTMPI_SET_AFFINITY)
-else (THREAD_MPI_SET_AFFINITY)
-    add_definitions()
-endif (THREAD_MPI_SET_AFFINITY)
-
-include(CheckFunctionExists)
-if (THREAD_PTHREADS)
-    set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
-    # check for sched_setaffinity
-    check_c_source_compiles(
-        "#define _GNU_SOURCE
-#include <pthread.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <errno.h>
-int main(void) { cpu_set_t set;
-    CPU_ZERO(&set);
-    CPU_SET(0, &set);
-    pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
-    return 0;
-}"
-        PTHREAD_SETAFFINITY
-    )
-    if (PTHREAD_SETAFFINITY)
-        set(HAVE_PTHREAD_SETAFFINITY 1)
-    endif (PTHREAD_SETAFFINITY)
-    set(CMAKE_REQUIRED_LIBRARIES)
-endif (THREAD_PTHREADS)
-
-
-# this runs on POSIX systems
-check_include_files(unistd.h        HAVE_UNISTD_H)
-check_include_files(sched.h         HAVE_SCHED_H)
-check_include_files(sys/time.h      HAVE_SYS_TIME_H)
-check_function_exists(sysconf       HAVE_SYSCONF)
-# this runs on windows
-#check_include_files(windows.h         HAVE_WINDOWS_H)
diff --git a/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake b/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake
new file mode 100644 (file)
index 0000000..14c58b9
--- /dev/null
@@ -0,0 +1,55 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+# the name of the target operating system
+set(CMAKE_SYSTEM_NAME Linux CACHE STRING "Cross-compiling for Fujitsu Sparc64")
+
+set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
+
+# set the compiler
+set(CMAKE_C_COMPILER mpifccpx)
+set(CMAKE_CXX_COMPILER mpiFCCpx)
+set(CMAKE_C_COMPILER_ID "Fujitsu" CACHE STRING "Prevent CMake from adding GNU-specific linker flags (-rdynamic)" FORCE)
+
+set(CMAKE_C_FLAGS "-Kopenmp -Kfast,reduction,swp,simd=2,uxsimd -x500 -Xg -DGMX_RELAXED_DOUBLE_PRECISION -w" CACHE STRING "Fujitsu Sparc64 C Flags" FORCE)
+set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "Fujitsu Sparc64 C++ Flags" FORCE)
+set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Use native 1.0/sqrt(x) on Fujitsu Sparc64" FORCE)
+
+set(GMX_THREAD_MPI OFF CACHE BOOL "Use real MPI instead" FORCE)
+set(GMX_MPI ON CACHE BOOL "Use MPI library" FORCE)
+set(GMX_DOUBLE ON CACHE BOOL "Use double by default on Fujitsu Sparc64 (due to HPC-ACE)" FORCE)
+set(GMX_GPU OFF CACHE BOOL "Cannot do GPU acceleration on Fujitsu Sparc64" FORCE)
+set(BUILD_SHARED_LIBS OFF CACHE BOOL "Use static linking by default on Fujitsu Sparc64" FORCE)
+
+set(GMX_CPU_ACCELERATION "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE acceleration when using Fujitsu Sparc64 toolchain")
diff --git a/cmake/Toolchain-Fujitsu-Sparc64.cmake b/cmake/Toolchain-Fujitsu-Sparc64.cmake
new file mode 100644 (file)
index 0000000..c76c4d9
--- /dev/null
@@ -0,0 +1,54 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+# the name of the target operating system
+set(CMAKE_SYSTEM_NAME Linux CACHE STRING "Cross-compiling for Fujitsu Sparc64")
+
+set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
+
+# set the compiler
+set(CMAKE_C_COMPILER fccpx)
+set(CMAKE_CXX_COMPILER FCCpx)
+set(CMAKE_C_COMPILER_ID "Fujitsu" CACHE STRING "Prevent CMake from adding GNU-specific linker flags (-rdynamic)" FORCE)
+
+set(CMAKE_C_FLAGS "-Kopenmp -Kfast,reduction,swp,simd=2,uxsimd -x500 -Xg -DGMX_RELAXED_DOUBLE_PRECISION -w" CACHE STRING "Fujitsu Sparc64 C Flags" FORCE)
+set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "Fujitsu Sparc64 C++ Flags" FORCE)
+set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Use native 1.0/sqrt(x) on Fujitsu Sparc64" FORCE)
+
+# By default CMake will use thread-mpi
+set(GMX_DOUBLE ON CACHE BOOL "Use double by default on Fujitsu Sparc64 (due to HPC-ACE)" FORCE)
+set(GMX_GPU OFF CACHE BOOL "Cannot do GPU acceleration on Fujitsu Sparc64" FORCE)
+set(BUILD_SHARED_LIBS OFF CACHE BOOL "Use static linking by default on Fujitsu Sparc64" FORCE)
+
+set(GMX_CPU_ACCELERATION "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE acceleration when using Fujitsu Sparc64 toolchain")
index dbfc40081aeba82eb721a90ba4880945ccfa2248..0fd56868e79cfc6c103d57103d94e6fff9ced041 100644 (file)
@@ -708,7 +708,7 @@ as it is (nearly) free; selects <b>None</b> with the group cutoff-scheme.</dd>
 This makes the potential the integral of the force. Note that this does not
 affect the forces or the sampling.</dd>
 <dt><b>None</b></dt>
-<dd>Use an unmodified Coulomb potential.</dd>
+<dd>Use an unmodified Coulomb potential. With the group scheme this means no exact cut-off is used; energies and forces are calculated for all pairs in the neighborlist.</dd>
 </dl></dd>
 
 
@@ -781,7 +781,7 @@ as it is (nearly) free; selects <b>None</b> with the group cutoff-scheme.</dd>
 This makes the potential the integral of the force. Note that this does not
 affect the forces or the sampling.</dd>
 <dt><b>None</b></dt>
-<dd>Use an unmodified Van der Waals potential.</dd>
+<dd>Use an unmodified Van der Waals potential. With the group scheme this means no exact cut-off is used; energies and forces are calculated for all pairs in the neighborlist.</dd>
 </dl></dd>
 
 <dt><b>rvdw-switch: (0) [nm]</b></dt>
index 2c003ba0245230b628ef7f216721b50d88d83f15..619ee23fba54238f3b2b29bee9c70ba72cea7761 100644 (file)
 /* IBM QPX was selected as CPU acceleration type (e.g. BlueGene/Q) */
 #cmakedefine GMX_CPU_ACCELERATION_IBM_QPX
 
+/* Fujitsu Sparc64 HPC-ACE SIMD acceleration */
+#cmakedefine GMX_CPU_ACCELERATION_SPARC64_HPC_ACE
+
 /* String for CPU acceleration choice (for writing to log files and stdout) */
 #define GMX_CPU_ACCELERATION_STRING "@GMX_CPU_ACCELERATION@"
 
 /* Define when Windows threads are used */
 #cmakedefine THREAD_WINDOWS
 
-/* Define when thread-MPI atomic operations are available */
+/* Define when there is a pthread.h */
+#cmakedefine HAVE_PTHREAD_H
+
+/* Define when native atomic operations are found */
 #cmakedefine TMPI_ATOMICS
 
 /* Define for busy wait option  */
 /* Define for copy buffer option */
 #cmakedefine TMPI_COPY_BUFFER
 
+/* Define for tmpi warnings option */
+#cmakedefine TMPI_WARNINGS
+
 /* Define for profiling option */
 #cmakedefine TMPI_PROFILE
 
-/* Define for Linux pthread_setaffinity */
+/* Define for Linux pthread_setaffinity_np */
 #cmakedefine HAVE_PTHREAD_SETAFFINITY
 
-/* Define for sysconf() */
-#cmakedefine HAVE_SYSCONF
+/* Define for Windows NUMA-aware allocator functions */
+#cmakedefine TMPI_WINDOWS_NUMA_API
+
+/* Define for GetSystemInfo() */
+#cmakedefine HAVE_SYSTEM_INFO
 
 /* Enable x86 gcc inline assembly */
 #cmakedefine GMX_X86_GCC_INLINE_ASM
index 122d3d65933c9c7f410d5b74131a37fe3633681c..4a24ea203801a73703171f44fb694171a036a3bf 100644 (file)
 #include "mtop_util.h"
 #include "gmx_ana.h"
 
+/* Return the greatest common divisor of p and q using Euclid's algorithm.
+ *
+ * The result is always non-negative, also when one or both arguments are
+ * negative. If both arguments are zero the result is zero, so callers must
+ * check for that before using the result as a divisor.
+ */
+static int greatest_common_divisor(int p, int q)
+{
+    int tmp;
+    while (q != 0)
+    {
+        tmp = q;
+        q   = p % q;
+        p   = tmp;
+    }
+    /* The % operator keeps the sign of the dividend, so p can be negative here */
+    return (p < 0) ? -p : p;
+}
+
 static void insert_ion(int nsa, int *nwater,
                        gmx_bool bSet[], int repl[], atom_id index[],
                        real pot[], rvec x[], t_pbc *pbc,
@@ -430,8 +442,7 @@ int gmx_genion(int argc, char *argv[])
         { "-scale", FALSE, etREAL, {&scale}, "Scaling factor for the potential for [TT]-pot[tt]" },
         { "-conc",  FALSE, etREAL, {&conc},
           "Specify salt concentration (mol/liter). This will add sufficient ions to reach up to the specified concentration as computed from the volume of the cell in the input [TT].tpr[tt] file. Overrides the [TT]-np[tt] and [TT]-nn[tt] options." },
-        { "-neutral", FALSE, etBOOL, {&bNeutral},
-          "This option will add enough ions to neutralize the system. In combination with the concentration option a neutral system at a given salt concentration will be generated." }
+        { "-neutral", FALSE, etBOOL, {&bNeutral}, "This option will add enough ions to neutralize the system. These ions are added on top of those specified with [TT]-np[tt]/[TT]-nn[tt] or [TT]-conc[tt]. "}
     };
     gmx_mtop_t        *mtop;
     gmx_localtop_t    *top;
@@ -494,33 +505,39 @@ int gmx_genion(int argc, char *argv[])
     }
     iqtot = gmx_nint(qtot);
 
-    if ((conc > 0) || bNeutral)
+    
+    if (conc > 0)
     {
         /* Compute number of ions to be added */
         vol = det(box);
-        if (conc > 0)
+        nsalt = gmx_nint(conc*vol*AVOGADRO/1e24);
+        p_num = abs(nsalt*n_q);
+        n_num = abs(nsalt*p_q);
+    }
+    if (bNeutral)
+    {
+        int qdelta = p_num*p_q + n_num*n_q + iqtot;
+
+        /* Check if the system is neutralizable
+         * is (qdelta == p_q*p_num + n_q*n_num) solvable for p_num and n_num? */
+        int gcd = greatest_common_divisor(n_q, p_q);
+        if ((qdelta % gcd) != 0)
+        {
+            gmx_fatal(FARGS, "Can't neutralize this system using -nq %d and"
+                    " -pq %d.\n", n_q, p_q);
+        }
+        
+        while (qdelta != 0)
         {
-            nsalt = gmx_nint(conc*vol*AVOGADRO/1e24);
-            p_num = abs(nsalt*n_q);
-            n_num = abs(nsalt*p_q);
-            if (bNeutral)
+            while (qdelta < 0)
             {
-                int qdelta = 0;
-                do
-                {
-                    qdelta = (p_num*p_q + n_num*n_q + iqtot);
-                    if (qdelta < 0)
-                    {
-                        p_num  += abs(qdelta/p_q);
-                        qdelta  = (p_num*p_q + n_num*n_q + iqtot);
-                    }
-                    if (qdelta > 0)
-                    {
-                        n_num  += abs(qdelta/n_q);
-                        qdelta  = (p_num*p_q + n_num*n_q + iqtot);
-                    }
-                }
-                while (qdelta != 0);
+                p_num++;
+                qdelta += p_q;
+            }
+            while (qdelta > 0)
+            {
+                n_num++;
+                qdelta += n_q;
             }
         }
     }
index 15b8c561bfe5b91ef31cf4e4dd9dfba33a5b07c3..93a2e2718157133a09065f6a6da3eb32ec08993c 100644 (file)
  * in a single file, but to avoid repeated ifdefs we set the overall architecture here.
  */
 #if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+/* OK, it is x86, but can we execute cpuid? */
+#if defined(GMX_X86_GCC_INLINE_ASM) || ( defined(_MSC_VER) && ( (_MSC_VER > 1500) || (_MSC_VER==1500 & _MSC_FULL_VER >= 150030729)))
 #    define GMX_CPUID_X86
 #endif
+#endif
 
 /* Global constant character strings corresponding to our enumerated types */
 const char *
@@ -60,7 +63,9 @@ gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS] =
     "CannotDetect",
     "Unknown",
     "GenuineIntel",
-    "AuthenticAMD"
+    "AuthenticAMD",
+    "Fujitsu",
+    "IBM"
 };
 
 const char *
@@ -111,7 +116,8 @@ gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS] =
     "SSE2",
     "SSE4.1",
     "AVX_128_FMA",
-    "AVX_256"
+    "AVX_256",
+    "Sparc64 HPC-ACE"
 };
 
 /* Max length of brand string */
@@ -209,6 +215,10 @@ enum gmx_cpuid_acceleration
 static const
 enum gmx_cpuid_acceleration
     compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE2;
+#elif defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE
+static const
+enum gmx_cpuid_acceleration
+    compiled_acc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE;
 #else
 static const
 enum gmx_cpuid_acceleration
@@ -682,6 +692,48 @@ cpuid_check_intel_x86(gmx_cpuid_t                cpuid)
 
 
 
+
+/* Copy the part of 'in' that precedes the first ':' into 's', with trailing
+ * whitespace removed.
+ *
+ * 's' must have room for 'maxlength' characters; at most maxlength-1
+ * characters are copied and 's' is always nul-terminated on return.
+ * If the copied text contains no colon, 's' is set to the empty string.
+ * Nothing is written when maxlength is not positive.
+ */
+static void
+chomp_substring_before_colon(const char *in, char *s, int maxlength)
+{
+    char *p;
+
+    if (maxlength <= 0)
+    {
+        return;
+    }
+    /* strncpy() does not terminate the destination when the source is too long */
+    strncpy(s, in, maxlength-1);
+    s[maxlength-1] = '\0';
+    p = strchr(s, ':');
+    if (p != NULL)
+    {
+        *p = '\0';
+        /* Test the bound before looking at the previous character, so that
+         * we never read in front of the buffer when the colon comes first.
+         */
+        while (p > s && isspace((unsigned char) *(p-1)))
+        {
+            *(--p) = '\0';
+        }
+    }
+    else
+    {
+        *s = '\0';
+    }
+}
+
+/* Copy the part of 'in' that follows the first ':' into 's', with leading
+ * and trailing whitespace removed.
+ *
+ * 's' must have room for 'maxlength' characters; at most maxlength-1
+ * characters are copied and 's' is always nul-terminated on return.
+ * If 'in' contains no colon, 's' is set to the empty string.
+ * Nothing is written when maxlength is not positive.
+ */
+static void
+chomp_substring_after_colon(const char *in, char *s, int maxlength)
+{
+    const char *start;
+    char       *end;
+
+    if (maxlength <= 0)
+    {
+        return;
+    }
+    start = strchr(in, ':');
+    if (start != NULL)
+    {
+        /* Skip the colon itself and any whitespace after it */
+        start++;
+        while (isspace((unsigned char) *start))
+        {
+            start++;
+        }
+        /* strncpy() does not terminate the destination when the source is too long */
+        strncpy(s, start, maxlength-1);
+        s[maxlength-1] = '\0';
+        end = s + strlen(s);
+        /* Test the bound first, so an empty value never reads in front of s */
+        while (end > s && isspace((unsigned char) *(end-1)))
+        {
+            *(--end) = '\0';
+        }
+    }
+    else
+    {
+        *s = '\0';
+    }
+}
+
 /* Try to find the vendor of the current CPU, so we know what specific
  * detection routine to call.
  */
@@ -692,6 +744,8 @@ cpuid_check_vendor(void)
     /* Register data used on x86 */
     unsigned int               eax, ebx, ecx, edx;
     char                       vendorstring[13];
+    FILE *                     fp;
+    char                       buffer[255],buffer2[255];
 
     /* Set default first */
     vendor = GMX_CPUID_VENDOR_UNKNOWN;
@@ -712,6 +766,29 @@ cpuid_check_vendor(void)
             vendor = i;
         }
     }
+#elif defined(__linux__) || defined(__linux)
+    /* General Linux. Try to get CPU vendor from /proc/cpuinfo */
+    if( (fp = fopen("/proc/cpuinfo","r")) != NULL)
+    {
+        /* Read line by line until a line has given us a known vendor */
+        while( (vendor == GMX_CPUID_VENDOR_UNKNOWN) && (fgets(buffer,sizeof(buffer),fp) != NULL))
+        {
+            chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2));
+            /* Intel/AMD use "vendor_id", IBM "vendor". Fujitsu "manufacture". Add others if you have them! */
+            if( !strcmp(buffer2,"vendor_id") || !strcmp(buffer2,"vendor") || !strcmp(buffer2,"manufacture") )
+            {
+                chomp_substring_after_colon(buffer,buffer2,sizeof(buffer2));
+                for(i=GMX_CPUID_VENDOR_UNKNOWN; i<GMX_CPUID_NVENDORS; i++)
+                {
+                    /* Be liberal and accept if we find the vendor anywhere in string */
+                    if(strstr(buffer2,gmx_cpuid_vendor_string[i]))
+                    {
+                        vendor = i;
+                    }
+                }
+            }
+        }
+        /* Close only a stream that was actually opened; fclose(NULL) is undefined */
+        fclose(fp);
+    }
 #else
     vendor = GMX_CPUID_VENDOR_UNKNOWN;
 #endif
@@ -780,6 +857,9 @@ gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
 {
     gmx_cpuid_t cpuid;
     int         i;
+    FILE *      fp;
+    char        buffer[255],buffer2[255];
+    int         found_brand;
 
     cpuid = malloc(sizeof(*cpuid));
 
@@ -789,6 +869,7 @@ gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
     {
         cpuid->feature[i] = 0;
     }
+
     cpuid->have_cpu_topology   = 0;
     cpuid->nproc               = 0;
     cpuid->npackages           = 0;
@@ -812,20 +893,37 @@ gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
             break;
 #endif
         default:
-            /* Could not find vendor */
-            strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_BRAND_MAXLEN);
+            /* Default value */
+            strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN);
+#if defined(__linux__) || defined(__linux)
+            /* General Linux. Try to get CPU type from /proc/cpuinfo */
+            if( (fp = fopen("/proc/cpuinfo","r")) != NULL)
+            {
+                found_brand = 0;
+                /* Stop at the first line that provides a brand string */
+                while( (found_brand==0) && (fgets(buffer,sizeof(buffer),fp) !=NULL))
+                {
+                    chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2));
+                    /* Intel uses "model name", Fujitsu and IBM "cpu". */
+                    if( !strcmp(buffer2,"model name") || !strcmp(buffer2,"cpu"))
+                    {
+                        chomp_substring_after_colon(buffer,cpuid->brand,GMX_CPUID_BRAND_MAXLEN);
+                        found_brand = 1;
+                    }
+                }
+                /* Close only a stream that was actually opened; fclose(NULL) is undefined */
+                fclose(fp);
+            }
+#endif
             cpuid->family         = 0;
             cpuid->model          = 0;
             cpuid->stepping       = 0;
-
-            for (i = 0; i < GMX_CPUID_NFEATURES; i++)
+            
+            for(i=0; i<GMX_CPUID_NFEATURES; i++)
             {
-                cpuid->feature[i] = 0;
+                cpuid->feature[i]=0;
             }
             cpuid->feature[GMX_CPUID_FEATURE_CANNOTDETECT] = 1;
             break;
     }
-
     return 0;
 }
 
@@ -936,7 +1034,13 @@ gmx_cpuid_acceleration_suggest  (gmx_cpuid_t                 cpuid)
             tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2;
         }
     }
-
+    else if(gmx_cpuid_vendor(cpuid)==GMX_CPUID_VENDOR_FUJITSU)
+    {
+        if(strstr(gmx_cpuid_brand(cpuid),"SPARC64"))
+        {
+            tmpacc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE;
+        }
+    }
     return tmpacc;
 }
 
@@ -987,7 +1091,6 @@ gmx_cpuid_acceleration_check(gmx_cpuid_t   cpuid,
 }
 
 
-
 #ifdef GMX_CPUID_STANDALONE
 /* Stand-alone program to enable queries of CPU features from Cmake.
  * Note that you need to check inline ASM capabilities before compiling and set
index 5fcbe2da7f71d7c759415118c4a757961f2671f3..1787371a5f37fd122c0027ace84965034af85c4f 100644 (file)
@@ -1,40 +1,45 @@
 # Sources that should always be built
 file(GLOB NONBONDED_SOURCES *.c nb_kernel_c/*.c)
 
-if(GMX_CPU_ACCELERATION STREQUAL "SSE2" AND NOT GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE2" AND NOT GMX_DOUBLE)
     file(GLOB NONBONDED_SSE2_SINGLE_SOURCES nb_kernel_sse2_single/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "SSE4.1" AND NOT GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE4.1" AND NOT GMX_DOUBLE)
     file(GLOB NONBONDED_SSE4_1_SINGLE_SOURCES nb_kernel_sse4_1_single/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE)
     file(GLOB NONBONDED_AVX_128_FMA_SINGLE_SOURCES nb_kernel_avx_128_fma_single/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "AVX_256" AND NOT GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_256" AND NOT GMX_DOUBLE)
     file(GLOB NONBONDED_AVX_256_SINGLE_SOURCES nb_kernel_avx_256_single/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "SSE2" AND GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE2" AND GMX_DOUBLE)
     file(GLOB NONBONDED_SSE2_DOUBLE_SOURCES nb_kernel_sse2_double/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "SSE4.1" AND GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE4.1" AND GMX_DOUBLE)
     file(GLOB NONBONDED_SSE4_1_DOUBLE_SOURCES nb_kernel_sse4_1_double/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "AVX_128_FMA" AND GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_128_FMA" AND GMX_DOUBLE)
     file(GLOB NONBONDED_AVX_128_FMA_DOUBLE_SOURCES nb_kernel_avx_128_fma_double/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "AVX_256" AND GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_256" AND GMX_DOUBLE)
     file(GLOB NONBONDED_AVX_256_DOUBLE_SOURCES nb_kernel_avx_256_double/*.c)
 endif()
 
+if("${GMX_CPU_ACCELERATION}" STREQUAL "Sparc64_HPC_ACE" AND GMX_DOUBLE)
+    file(GLOB NONBONDED_SPARC64_HPC_ACE_DOUBLE_SOURCES nb_kernel_sparc64_hpc_ace_double/*.c)
+endif()
+
+
 # These sources will be used in the parent directory's CMakeLists.txt
-set(NONBONDED_SOURCES ${NONBONDED_SOURCES} ${NONBONDED_SSE2_SINGLE_SOURCES} ${NONBONDED_SSE4_1_SINGLE_SOURCES} ${NONBONDED_AVX_128_FMA_SINGLE_SOURCES} ${NONBONDED_AVX_256_SINGLE_SOURCES} ${NONBONDED_SSE2_DOUBLE_SOURCES} ${NONBONDED_SSE4_1_DOUBLE_SOURCES} ${NONBONDED_AVX_128_FMA_DOUBLE_SOURCES} ${NONBONDED_AVX_256_DOUBLE_SOURCES} PARENT_SCOPE)
+set(NONBONDED_SOURCES ${NONBONDED_SOURCES} ${NONBONDED_SSE2_SINGLE_SOURCES} ${NONBONDED_SSE4_1_SINGLE_SOURCES} ${NONBONDED_AVX_128_FMA_SINGLE_SOURCES} ${NONBONDED_AVX_256_SINGLE_SOURCES} ${NONBONDED_SSE2_DOUBLE_SOURCES} ${NONBONDED_SSE4_1_DOUBLE_SOURCES} ${NONBONDED_AVX_128_FMA_DOUBLE_SOURCES} ${NONBONDED_AVX_256_DOUBLE_SOURCES} ${NONBONDED_SPARC64_HPC_ACE_DOUBLE_SOURCES} PARENT_SCOPE)
 
 
 
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h
new file mode 100644 (file)
index 0000000..dfd3839
--- /dev/null
@@ -0,0 +1,945 @@
+/*
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2011-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under 
+ * the terms of the GNU Lesser General Public License as published by the Free 
+ * Software Foundation; either version 2 of the License, or (at your option) any 
+ * later version.
+ * As a special exception, you may use this file as part of a free software
+ * library without restriction.  Specifically, if other files instantiate
+ * templates or use macros or inline functions from this file, or you compile
+ * this file and link it with other files to produce an executable, this
+ * file does not by itself cause the resulting executable to be covered by
+ * the GNU Lesser General Public License.
+ *
+ * In plain-speak: do not worry about classes/macros/templates either - only
+ * changes to the library have to be LGPL, not an application linking with it.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website!
+ */
+#ifndef _kernelutil_sparc64_hpc_ace_double_h_
+#define _kernelutil_sparc64_hpc_ace_double_h_
+
+/* Fujitsu header borrows the name from SSE2, since some instructions have aliases */
+#include "emmintrin.h"
+
+#define GMX_FJSP_SHUFFLE2(x,y) (((x)<<1) | (y))
+
+#define GMX_FJSP_TRANSPOSE2_V2R8(row0, row1) {           \
+    _fjsp_v2r8 __gmx_t1 = row0;                          \
+    row0           = _fjsp_unpacklo_v2r8(row0,row1);     \
+    row1           = _fjsp_unpackhi_v2r8(__gmx_t1,row1); \
+}
+
+
+/* Debug helper: print the label s followed by the two doubles extracted from
+ * a with _fjsp_storel_v2r8() and _fjsp_storeh_v2r8(), in that order.
+ */
+static void
+gmx_fjsp_print_v2r8(const char *s, _fjsp_v2r8 a)
+{
+    double elem[2];
+
+    _fjsp_storel_v2r8(&elem[0], a);
+    _fjsp_storeh_v2r8(&elem[1], a);
+    printf("%s: %g %g\n", s, elem[0], elem[1]);
+}
+
+
+/* Build a vector by passing d as both arguments of _fjsp_set_v2r8() */
+static _fjsp_v2r8
+gmx_fjsp_set1_v2r8(double d)
+{
+    const _fjsp_v2r8 both = _fjsp_set_v2r8(d, d);
+
+    return both;
+}
+
+/* Read one double from ptr and broadcast it with gmx_fjsp_set1_v2r8() */
+static _fjsp_v2r8
+gmx_fjsp_load1_v2r8(const double * gmx_restrict ptr)
+{
+    const double value = *ptr;
+
+    return gmx_fjsp_set1_v2r8(value);
+}
+
+
+/* Return 1 if the _fjsp_cmplt_v2r8(a,b) mask is non-zero in either element,
+ * and 0 otherwise (presumably "a<b in at least one element" - this relies on
+ * the comparison producing an all-zero element where it is false).
+ */
+static int
+gmx_fjsp_any_lt_v2r8(_fjsp_v2r8 a, _fjsp_v2r8 b)
+{
+    union
+    {
+        double           d;
+        long long int    i;
+    }
+    bits;
+    _fjsp_v2r8 mask, mask_hi, mask_any;
+
+    mask     = _fjsp_cmplt_v2r8(a, b);
+    /* OR the high element onto the low one so a single store sees both */
+    mask_hi  = _fjsp_unpackhi_v2r8(mask, mask);
+    mask_any = _fjsp_or_v2r8(mask, mask_hi);
+    /* Read the low element back through the union as an integer bit pattern */
+    _fjsp_storel_v2r8(&(bits.d), mask_any);
+
+    return (bits.i != 0);
+}
+
+/* 1.0/sqrt(x)
+ *
+ * Starts from the _fjsp_rsqrta_v2r8() estimate of x and applies the same
+ * refinement expression to it: twice when GMX_RELAXED_DOUBLE_PRECISION is
+ * defined, three times otherwise. The last application is the return value.
+ */
+static gmx_inline _fjsp_v2r8
+gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
+{
+    const _fjsp_v2r8 half  = gmx_fjsp_set1_v2r8(0.5);
+    const _fjsp_v2r8 three = gmx_fjsp_set1_v2r8(3.0);
+    _fjsp_v2r8 lu = _fjsp_rsqrta_v2r8(x);
+    
+    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+    /* The HPC-ACE instruction set is only available in double precision, while
+     * single precision is typically sufficient for Gromacs. If you define 
+     * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson 
+     * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full 
+     * double precision (53 bits). This is still clearly higher than single precision (24 bits).
+     */
+#ifndef GMX_RELAXED_DOUBLE_PRECISION
+    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+#endif
+    return _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+}
+
+
+/* 1.0/x
+ *
+ * Starts from the _fjsp_rcpa_v2r8() estimate of x and refines it by
+ * multiplying with _fjsp_nmsub_v2r8(lu,x,two): twice when
+ * GMX_RELAXED_DOUBLE_PRECISION is defined, three times otherwise.
+ * NOTE(review): this is a Newton-Raphson step if _fjsp_nmsub_v2r8(a,b,c)
+ * evaluates c-a*b - confirm against the Fujitsu intrinsics reference.
+ */
+static gmx_inline _fjsp_v2r8
+gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
+{
+    const _fjsp_v2r8 two  = gmx_fjsp_set1_v2r8(2.0);
+    /* Use the same vector type as the rest of this header, not the SSE2 alias */
+    _fjsp_v2r8 lu = _fjsp_rcpa_v2r8(x);
+
+    /* First refinement step, always performed */
+    lu         = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+    /* The HPC-ACE instruction set is only available in double precision, while
+     * single precision is typically sufficient for Gromacs. If you define
+     * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
+     * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
+     * double precision (53 bits). This is still clearly higher than single precision (24 bits).
+     */
+#ifndef GMX_RELAXED_DOUBLE_PRECISION
+    lu         = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+#endif
+    /* Final refinement step, always performed */
+    return _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+}
+
+
+/* Combine dx, dy and dz into dx*dx + dy*dy + dz*dz per element, using one
+ * multiply followed by two _fjsp_madd_v2r8() calls.
+ */
+static gmx_inline _fjsp_v2r8
+gmx_fjsp_calc_rsq_v2r8(_fjsp_v2r8 dx, _fjsp_v2r8 dy, _fjsp_v2r8 dz)
+{
+    _fjsp_v2r8 rsq;
+
+    rsq = _fjsp_mul_v2r8(dz, dz);
+    rsq = _fjsp_madd_v2r8(dy, dy, rsq);
+    rsq = _fjsp_madd_v2r8(dx, dx, rsq);
+
+    return rsq;
+}
+
+/* Normal sum of four _fjsp_v2r8 SIMD values */
+#define gmx_fjsp_sum4_v2r8(t0,t1,t2,t3)  _fjsp_add_v2r8(_fjsp_add_v2r8(t0,t1),_fjsp_add_v2r8(t2,t3))
+
+
+
+
+
+/* Load one double from ptrA and one from ptrB and combine them with
+ * _fjsp_unpacklo_v2r8(), the value from ptrA being the first argument.
+ */
+static _fjsp_v2r8
+gmx_fjsp_load_2real_swizzle_v2r8(const double * gmx_restrict ptrA,
+                                 const double * gmx_restrict ptrB)
+{
+    _fjsp_v2r8 valA, valB;
+
+    valA = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
+    valB = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB);
+
+    return _fjsp_unpacklo_v2r8(valA, valB);
+}
+
+/* Load the double at ptrA into a zeroed vector with _fjsp_loadl_v2r8() */
+static _fjsp_v2r8
+gmx_fjsp_load_1real_v2r8(const double * gmx_restrict ptrA)
+{
+    const _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+    return _fjsp_loadl_v2r8(zero, ptrA);
+}
+
+
+/* Store the two elements of xmm1 to separate addresses: the low element goes
+ * to ptrA, and the element moved down by _fjsp_unpackhi_v2r8() goes to ptrB.
+ */
+static void
+gmx_fjsp_store_2real_swizzle_v2r8(double * gmx_restrict ptrA,
+                                double * gmx_restrict ptrB,
+                                _fjsp_v2r8 xmm1)
+{
+    const _fjsp_v2r8 upper = _fjsp_unpackhi_v2r8(xmm1, xmm1);
+
+    _fjsp_storel_v2r8(ptrA, xmm1);
+    _fjsp_storel_v2r8(ptrB, upper);
+}
+
+/* Store the low element of xmm1 to ptrA with _fjsp_storel_v2r8() */
+static void
+gmx_fjsp_store_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
+{
+    _fjsp_storel_v2r8(ptrA, xmm1);
+}
+
+
+/* Similar to store, but increments value in memory: the low element of xmm1
+ * is added to *ptrA and the element moved down by _fjsp_unpackhi_v2r8() is
+ * added to *ptrB. Both old values are read before either store is done.
+ */
+static void
+gmx_fjsp_increment_2real_swizzle_v2r8(double * gmx_restrict ptrA,
+                                    double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
+{
+    _fjsp_v2r8 upper, sumA, sumB;
+
+    upper = _fjsp_unpackhi_v2r8(xmm1, xmm1);
+    sumA  = _fjsp_add_v2r8(xmm1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA));
+    sumB  = _fjsp_add_v2r8(upper, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB));
+
+    _fjsp_storel_v2r8(ptrA, sumA);
+    _fjsp_storel_v2r8(ptrB, sumB);
+}
+
+/* Add the low element of xmm1 to the double stored at ptrA */
+static void
+gmx_fjsp_increment_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
+{
+    const _fjsp_v2r8 old = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
+
+    _fjsp_storel_v2r8(ptrA, _fjsp_add_v2r8(old, xmm1));
+}
+
+
+
+/* Load two consecutive doubles from each of p1 and p2 and transpose them:
+ * *c6 receives the first value of each pair (_fjsp_unpacklo_v2r8) and *c12
+ * the second value of each pair (_fjsp_unpackhi_v2r8).
+ */
+static gmx_inline void
+gmx_fjsp_load_2pair_swizzle_v2r8(const double * gmx_restrict p1,
+                             const double * gmx_restrict p2,
+                             _fjsp_v2r8 * gmx_restrict c6,
+                             _fjsp_v2r8 * gmx_restrict c12)
+{
+    _fjsp_v2r8 t1,t2;
+
+    /* The c6/c12 array should be aligned */
+    t1   = _fjsp_load_v2r8(p1);
+    t2   = _fjsp_load_v2r8(p2);
+    *c6  = _fjsp_unpacklo_v2r8(t1,t2);
+    *c12 = _fjsp_unpackhi_v2r8(t1,t2);
+}
+
+/* Load the two consecutive doubles at p1: p1[0] into *c6 and p1[1] into
+ * *c12, each through _fjsp_loadl_v2r8() on a zeroed vector.
+ */
+static gmx_inline void
+gmx_fjsp_load_1pair_swizzle_v2r8(const double * gmx_restrict p1,
+                             _fjsp_v2r8 * gmx_restrict c6,
+                             _fjsp_v2r8 * gmx_restrict c12)
+{
+    const double * gmx_restrict first  = p1;
+    const double * gmx_restrict second = p1+1;
+
+    *c6  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), first);
+    *c12 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), second);
+}
+
+
+/* Load three doubles from xyz, add the three doubles from xyz_shift, and
+ * return each of the three sums duplicated into both elements of *x1, *y1
+ * and *z1 respectively.
+ */
+static gmx_inline void
+gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
+                                         const double * gmx_restrict xyz,
+                                         _fjsp_v2r8 * gmx_restrict x1,
+                                         _fjsp_v2r8 * gmx_restrict y1,
+                                         _fjsp_v2r8 * gmx_restrict z1)
+{
+    _fjsp_v2r8 pos_xy, pos_z, shift_xy, shift_z;
+
+    /* First two values as a full vector, the third one on its own */
+    pos_xy   = _fjsp_load_v2r8(xyz);
+    pos_z    = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+2);
+    shift_xy = _fjsp_load_v2r8(xyz_shift);
+    shift_z  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
+
+    pos_xy   = _fjsp_add_v2r8(pos_xy, shift_xy);
+    pos_z    = _fjsp_add_v2r8(pos_z, shift_z);
+
+    *x1 = _fjsp_shuffle_v2r8(pos_xy, pos_xy, GMX_FJSP_SHUFFLE2(0,0));
+    *y1 = _fjsp_shuffle_v2r8(pos_xy, pos_xy, GMX_FJSP_SHUFFLE2(1,1));
+    *z1 = _fjsp_shuffle_v2r8(pos_z, pos_z, GMX_FJSP_SHUFFLE2(0,0));
+}
+
+
+/* Load nine consecutive doubles from xyz, add the three doubles from
+ * xyz_shift to each consecutive triple, and return each of the nine sums
+ * duplicated into both elements of its own output vector (x1..z3).
+ */
+static gmx_inline void
+gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
+                                         const double * gmx_restrict xyz,
+                                         _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                         _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                         _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+{
+    _fjsp_v2r8 pair0, pair2, pair4, pair6, last8;
+    _fjsp_v2r8 shift_xy, shift_z, shift_zx, shift_yz;
+
+    /* Four full vectors cover xyz[0..7]; xyz[8] is loaded on its own */
+    pair0 = _fjsp_load_v2r8(xyz);
+    pair2 = _fjsp_load_v2r8(xyz+2);
+    pair4 = _fjsp_load_v2r8(xyz+4);
+    pair6 = _fjsp_load_v2r8(xyz+6);
+    last8 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+8);
+
+    /* Rearrange the shift so it lines up with each two-element load */
+    shift_xy = _fjsp_load_v2r8(xyz_shift);
+    shift_z  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
+    shift_zx = _fjsp_shuffle_v2r8(shift_z, shift_xy, GMX_FJSP_SHUFFLE2(0,0));
+    shift_yz = _fjsp_shuffle_v2r8(shift_xy, shift_z, GMX_FJSP_SHUFFLE2(0,1));
+
+    pair0 = _fjsp_add_v2r8(pair0, shift_xy);
+    pair2 = _fjsp_add_v2r8(pair2, shift_zx);
+    pair4 = _fjsp_add_v2r8(pair4, shift_yz);
+    pair6 = _fjsp_add_v2r8(pair6, shift_xy);
+    last8 = _fjsp_add_v2r8(last8, shift_z);
+
+    /* Duplicate each element into both positions of its output vector */
+    *x1 = _fjsp_shuffle_v2r8(pair0, pair0, GMX_FJSP_SHUFFLE2(0,0));
+    *y1 = _fjsp_shuffle_v2r8(pair0, pair0, GMX_FJSP_SHUFFLE2(1,1));
+    *z1 = _fjsp_shuffle_v2r8(pair2, pair2, GMX_FJSP_SHUFFLE2(0,0));
+    *x2 = _fjsp_shuffle_v2r8(pair2, pair2, GMX_FJSP_SHUFFLE2(1,1));
+    *y2 = _fjsp_shuffle_v2r8(pair4, pair4, GMX_FJSP_SHUFFLE2(0,0));
+    *z2 = _fjsp_shuffle_v2r8(pair4, pair4, GMX_FJSP_SHUFFLE2(1,1));
+    *x3 = _fjsp_shuffle_v2r8(pair6, pair6, GMX_FJSP_SHUFFLE2(0,0));
+    *y3 = _fjsp_shuffle_v2r8(pair6, pair6, GMX_FJSP_SHUFFLE2(1,1));
+    *z3 = _fjsp_shuffle_v2r8(last8, last8, GMX_FJSP_SHUFFLE2(0,0));
+}
+
+
+/* Load the twelve doubles xyz[0..11] (four rvecs) and the three doubles
+ * xyz_shift[0..2], add the shift to the coordinates pair by pair, and write
+ * each of the twelve sums to its own output through a same-index shuffle.
+ *
+ * NOTE(review): the lane comments below assume _fjsp_shuffle_v2r8 and
+ * GMX_FJSP_SHUFFLE2 select elements like _mm_shuffle_pd/_MM_SHUFFLE2 do;
+ * confirm against the macro definition earlier in this header.
+ */
+static gmx_inline void
+gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
+                                         const double * gmx_restrict xyz,
+                                         _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                         _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                         _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                         _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+{
+    _fjsp_v2r8 shift_xy,shift_z,shift_zx,shift_yz;
+    _fjsp_v2r8 c01,c23,c45,c67,c89,c1011;
+
+    /* The shift vector, arranged to line up with each packed coordinate pair */
+    shift_xy = _fjsp_load_v2r8(xyz_shift);
+    shift_z  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
+    shift_zx = _fjsp_shuffle_v2r8(shift_z,shift_xy,GMX_FJSP_SHUFFLE2(0,0));
+    shift_yz = _fjsp_shuffle_v2r8(shift_xy,shift_z,GMX_FJSP_SHUFFLE2(0,1));
+
+    /* cNM holds xyz[N],xyz[M] plus the matching shift components */
+    c01   = _fjsp_add_v2r8(_fjsp_load_v2r8(xyz),   shift_xy);   /* x1 y1 */
+    c23   = _fjsp_add_v2r8(_fjsp_load_v2r8(xyz+2), shift_zx);   /* z1 x2 */
+    c45   = _fjsp_add_v2r8(_fjsp_load_v2r8(xyz+4), shift_yz);   /* y2 z2 */
+    c67   = _fjsp_add_v2r8(_fjsp_load_v2r8(xyz+6), shift_xy);   /* x3 y3 */
+    c89   = _fjsp_add_v2r8(_fjsp_load_v2r8(xyz+8), shift_zx);   /* z3 x4 */
+    c1011 = _fjsp_add_v2r8(_fjsp_load_v2r8(xyz+10),shift_yz);   /* y4 z4 */
+
+    /* One output per component */
+    *x1 = _fjsp_shuffle_v2r8(c01,c01,GMX_FJSP_SHUFFLE2(0,0));
+    *y1 = _fjsp_shuffle_v2r8(c01,c01,GMX_FJSP_SHUFFLE2(1,1));
+    *z1 = _fjsp_shuffle_v2r8(c23,c23,GMX_FJSP_SHUFFLE2(0,0));
+    *x2 = _fjsp_shuffle_v2r8(c23,c23,GMX_FJSP_SHUFFLE2(1,1));
+    *y2 = _fjsp_shuffle_v2r8(c45,c45,GMX_FJSP_SHUFFLE2(0,0));
+    *z2 = _fjsp_shuffle_v2r8(c45,c45,GMX_FJSP_SHUFFLE2(1,1));
+    *x3 = _fjsp_shuffle_v2r8(c67,c67,GMX_FJSP_SHUFFLE2(0,0));
+    *y3 = _fjsp_shuffle_v2r8(c67,c67,GMX_FJSP_SHUFFLE2(1,1));
+    *z3 = _fjsp_shuffle_v2r8(c89,c89,GMX_FJSP_SHUFFLE2(0,0));
+    *x4 = _fjsp_shuffle_v2r8(c89,c89,GMX_FJSP_SHUFFLE2(1,1));
+    *y4 = _fjsp_shuffle_v2r8(c1011,c1011,GMX_FJSP_SHUFFLE2(0,0));
+    *z4 = _fjsp_shuffle_v2r8(c1011,c1011,GMX_FJSP_SHUFFLE2(1,1));
+}
+
+
+
+/* Load the three doubles p1[0..2] into *x, *y and *z, one value per output.
+ * Every load is a _fjsp_loadl_v2r8 whose first operand is an all-zero vector.
+ */
+static gmx_inline void
+gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
+                                  _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
+{
+    _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+    *x = _fjsp_loadl_v2r8(zero,p1);
+    *y = _fjsp_loadl_v2r8(zero,p1+1);
+    *z = _fjsp_loadl_v2r8(zero,p1+2);
+}
+
+/* Load the nine doubles p1[0..8] (three rvecs) into the nine outputs, one
+ * value per output, in the order x1,y1,z1,x2,...,z3. Every load is a
+ * _fjsp_loadl_v2r8 whose first operand is an all-zero vector.
+ */
+static gmx_inline void
+gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
+                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+{
+    _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+    /* first rvec */
+    *x1 = _fjsp_loadl_v2r8(zero,p1);
+    *y1 = _fjsp_loadl_v2r8(zero,p1+1);
+    *z1 = _fjsp_loadl_v2r8(zero,p1+2);
+    /* second rvec */
+    *x2 = _fjsp_loadl_v2r8(zero,p1+3);
+    *y2 = _fjsp_loadl_v2r8(zero,p1+4);
+    *z2 = _fjsp_loadl_v2r8(zero,p1+5);
+    /* third rvec */
+    *x3 = _fjsp_loadl_v2r8(zero,p1+6);
+    *y3 = _fjsp_loadl_v2r8(zero,p1+7);
+    *z3 = _fjsp_loadl_v2r8(zero,p1+8);
+}
+
+/* Load the twelve doubles p1[0..11] (four rvecs) into the twelve outputs,
+ * one value per output, in the order x1,y1,z1,x2,...,z4. Every load is a
+ * _fjsp_loadl_v2r8 whose first operand is an all-zero vector.
+ */
+static gmx_inline void
+gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
+                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                  _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+{
+    _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+    /* first rvec */
+    *x1 = _fjsp_loadl_v2r8(zero,p1);
+    *y1 = _fjsp_loadl_v2r8(zero,p1+1);
+    *z1 = _fjsp_loadl_v2r8(zero,p1+2);
+    /* second rvec */
+    *x2 = _fjsp_loadl_v2r8(zero,p1+3);
+    *y2 = _fjsp_loadl_v2r8(zero,p1+4);
+    *z2 = _fjsp_loadl_v2r8(zero,p1+5);
+    /* third rvec */
+    *x3 = _fjsp_loadl_v2r8(zero,p1+6);
+    *y3 = _fjsp_loadl_v2r8(zero,p1+7);
+    *z3 = _fjsp_loadl_v2r8(zero,p1+8);
+    /* fourth rvec */
+    *x4 = _fjsp_loadl_v2r8(zero,p1+9);
+    *y4 = _fjsp_loadl_v2r8(zero,p1+10);
+    *z4 = _fjsp_loadl_v2r8(zero,p1+11);
+}
+
+
+/* Load one rvec (three doubles) from each of ptrA and ptrB and combine them
+ * into *x1, *y1 and *z1.
+ *
+ * NOTE(review): presumably each output ends up holding the ptrA value in one
+ * element and the ptrB value in the other; this relies on
+ * GMX_FJSP_TRANSPOSE2_V2R8 being a 2x2 transpose - confirm against its
+ * definition earlier in this header.
+ */
+static gmx_inline void
+gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA,
+                                  const double * gmx_restrict ptrB,
+                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
+{
+    _fjsp_v2r8 rowA,rowB,zA,zB,zAB;
+
+    /* elements 0 and 1 of each rvec */
+    rowA = _fjsp_load_v2r8(ptrA);
+    rowB = _fjsp_load_v2r8(ptrB);
+    /* element 2 of each rvec */
+    zA   = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+    zB   = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
+
+    GMX_FJSP_TRANSPOSE2_V2R8(rowA,rowB);
+    zAB  = _fjsp_unpacklo_v2r8(zA,zB);
+
+    *x1  = rowA;
+    *y1  = rowB;
+    *z1  = zAB;
+}
+
+/* Load three rvecs (nine doubles) from each of ptrA and ptrB and combine
+ * them into the nine outputs x1..z3.
+ *
+ * NOTE(review): presumably each output ends up holding the ptrA value in one
+ * element and the ptrB value in the other; this relies on
+ * GMX_FJSP_TRANSPOSE2_V2R8 being a 2x2 transpose - confirm against its
+ * definition earlier in this header.
+ */
+static gmx_inline void
+gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
+                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+{
+    /* aN/bN hold the doubles N,N+1 read from ptrA/ptrB */
+    _fjsp_v2r8 a0,b0,a2,b2,a4,b4,a6,b6,a8,b8,z8;
+
+    a0 = _fjsp_load_v2r8(ptrA);
+    b0 = _fjsp_load_v2r8(ptrB);
+    GMX_FJSP_TRANSPOSE2_V2R8(a0,b0);
+
+    a2 = _fjsp_load_v2r8(ptrA+2);
+    b2 = _fjsp_load_v2r8(ptrB+2);
+    GMX_FJSP_TRANSPOSE2_V2R8(a2,b2);
+
+    a4 = _fjsp_load_v2r8(ptrA+4);
+    b4 = _fjsp_load_v2r8(ptrB+4);
+    GMX_FJSP_TRANSPOSE2_V2R8(a4,b4);
+
+    a6 = _fjsp_load_v2r8(ptrA+6);
+    b6 = _fjsp_load_v2r8(ptrB+6);
+    GMX_FJSP_TRANSPOSE2_V2R8(a6,b6);
+
+    /* the ninth double of each pointer is read on its own */
+    a8 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
+    b8 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
+    z8 = _fjsp_unpacklo_v2r8(a8,b8);
+
+    *x1 = a0;
+    *y1 = b0;
+    *z1 = a2;
+    *x2 = b2;
+    *y2 = a4;
+    *z2 = b4;
+    *x3 = a6;
+    *y3 = b6;
+    *z3 = z8;
+}
+
+
+/* Load four rvecs (twelve doubles) from each of ptrA and ptrB and combine
+ * them into the twelve outputs x1..z4. The work is done in two halves of six
+ * doubles per pointer; the first half is written out before the second half
+ * is read.
+ *
+ * NOTE(review): presumably each output ends up holding the ptrA value in one
+ * element and the ptrB value in the other; this relies on
+ * GMX_FJSP_TRANSPOSE2_V2R8 being a 2x2 transpose - confirm against its
+ * definition earlier in this header.
+ */
+static gmx_inline void
+gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
+                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                  _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+{
+    /* aN/bN hold the doubles N,N+1 read from ptrA/ptrB */
+    _fjsp_v2r8 a0,b0,a2,b2,a4,b4;
+    _fjsp_v2r8 a6,b6,a8,b8,a10,b10;
+
+    /* first half: doubles 0..5 */
+    a0  = _fjsp_load_v2r8(ptrA);
+    b0  = _fjsp_load_v2r8(ptrB);
+    a2  = _fjsp_load_v2r8(ptrA+2);
+    b2  = _fjsp_load_v2r8(ptrB+2);
+    a4  = _fjsp_load_v2r8(ptrA+4);
+    b4  = _fjsp_load_v2r8(ptrB+4);
+    GMX_FJSP_TRANSPOSE2_V2R8(a0,b0);
+    GMX_FJSP_TRANSPOSE2_V2R8(a2,b2);
+    GMX_FJSP_TRANSPOSE2_V2R8(a4,b4);
+    *x1 = a0;
+    *y1 = b0;
+    *z1 = a2;
+    *x2 = b2;
+    *y2 = a4;
+    *z2 = b4;
+
+    /* second half: doubles 6..11 */
+    a6  = _fjsp_load_v2r8(ptrA+6);
+    b6  = _fjsp_load_v2r8(ptrB+6);
+    a8  = _fjsp_load_v2r8(ptrA+8);
+    b8  = _fjsp_load_v2r8(ptrB+8);
+    a10 = _fjsp_load_v2r8(ptrA+10);
+    b10 = _fjsp_load_v2r8(ptrB+10);
+    GMX_FJSP_TRANSPOSE2_V2R8(a6,b6);
+    GMX_FJSP_TRANSPOSE2_V2R8(a8,b8);
+    GMX_FJSP_TRANSPOSE2_V2R8(a10,b10);
+    *x3 = a6;
+    *y3 = b6;
+    *z3 = a8;
+    *x4 = b8;
+    *y4 = a10;
+    *z4 = b10;
+}
+
+
+/* Subtract x1, y1 and z1 from the three doubles ptrA[0..2] in place.
+ * Each double is read with _fjsp_loadl_v2r8, decremented and written back
+ * with _fjsp_storel_v2r8; all three reads happen before the first write.
+ */
+static void
+gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
+                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
+{
+    _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+    _fjsp_v2r8 newx,newy,newz;
+
+    newx = _fjsp_sub_v2r8(_fjsp_loadl_v2r8(zero,ptrA),  x1);
+    newy = _fjsp_sub_v2r8(_fjsp_loadl_v2r8(zero,ptrA+1),y1);
+    newz = _fjsp_sub_v2r8(_fjsp_loadl_v2r8(zero,ptrA+2),z1);
+
+    _fjsp_storel_v2r8(ptrA,  newx);
+    _fjsp_storel_v2r8(ptrA+1,newy);
+    _fjsp_storel_v2r8(ptrA+2,newz);
+}
+
+/* Update the three doubles ptrA[0..2] in place with
+ * _fjsp_nmsub_v2r8(fscal,d?1,old), one call per component.
+ * All three reads happen before the first write.
+ *
+ * NOTE(review): presumably _fjsp_nmsub_v2r8(a,b,c) evaluates c - a*b, so
+ * this subtracts fscal*dx1, fscal*dy1 and fscal*dz1 - confirm against the
+ * Fujitsu intrinsic documentation.
+ */
+static void
+gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 fscal,
+                                          _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
+{
+    _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+    _fjsp_v2r8 newx,newy,newz;
+
+    newx = _fjsp_nmsub_v2r8(fscal,dx1,_fjsp_loadl_v2r8(zero,ptrA));
+    newy = _fjsp_nmsub_v2r8(fscal,dy1,_fjsp_loadl_v2r8(zero,ptrA+1));
+    newz = _fjsp_nmsub_v2r8(fscal,dz1,_fjsp_loadl_v2r8(zero,ptrA+2));
+
+    _fjsp_storel_v2r8(ptrA,  newx);
+    _fjsp_storel_v2r8(ptrA+1,newy);
+    _fjsp_storel_v2r8(ptrA+2,newz);
+}
+
+
+/* Subtract three rvecs, passed as x1..z3, from the nine doubles ptrA[0..8]
+ * in place. The inputs are packed two by two with _fjsp_unpacklo_v2r8 so
+ * that each subtraction covers two consecutive doubles; z3 is subtracted on
+ * its own. All reads of ptrA happen before the first write.
+ */
+static void
+gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
+                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
+{
+    /* fNM is the updated pair for ptrA[N],ptrA[M] */
+    _fjsp_v2r8 f01,f23,f45,f67,f8;
+
+    f01 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA),  _fjsp_unpacklo_v2r8(x1,y1));
+    f23 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+2),_fjsp_unpacklo_v2r8(z1,x2));
+    f45 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+4),_fjsp_unpacklo_v2r8(y2,z2));
+    f67 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+6),_fjsp_unpacklo_v2r8(x3,y3));
+    /* z3 needs no packing */
+    f8  = _fjsp_sub_v2r8(_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8),z3);
+
+    /* Write back one double at a time */
+    _fjsp_storel_v2r8(ptrA,  f01);
+    _fjsp_storeh_v2r8(ptrA+1,f01);
+    _fjsp_storel_v2r8(ptrA+2,f23);
+    _fjsp_storeh_v2r8(ptrA+3,f23);
+    _fjsp_storel_v2r8(ptrA+4,f45);
+    _fjsp_storeh_v2r8(ptrA+5,f45);
+    _fjsp_storel_v2r8(ptrA+6,f67);
+    _fjsp_storeh_v2r8(ptrA+7,f67);
+    _fjsp_storel_v2r8(ptrA+8,f8);
+}
+
+
+/* Subtract four rvecs, passed as x1..z4, from the twelve doubles
+ * ptrA[0..11] in place. The inputs are packed two by two with
+ * _fjsp_unpacklo_v2r8 so that each subtraction covers two consecutive
+ * doubles. All reads of ptrA happen before the first write.
+ */
+static void
+gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
+                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
+                                       _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
+{
+    /* fNM is the updated pair for ptrA[N],ptrA[M] */
+    _fjsp_v2r8 f01,f23,f45,f67,f89,f1011;
+
+    f01   = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA),   _fjsp_unpacklo_v2r8(x1,y1));
+    f23   = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+2), _fjsp_unpacklo_v2r8(z1,x2));
+    f45   = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+4), _fjsp_unpacklo_v2r8(y2,z2));
+    f67   = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+6), _fjsp_unpacklo_v2r8(x3,y3));
+    f89   = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+8), _fjsp_unpacklo_v2r8(z3,x4));
+    f1011 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+10),_fjsp_unpacklo_v2r8(y4,z4));
+
+    /* Write back one double at a time */
+    _fjsp_storel_v2r8(ptrA,   f01);
+    _fjsp_storeh_v2r8(ptrA+1, f01);
+    _fjsp_storel_v2r8(ptrA+2, f23);
+    _fjsp_storeh_v2r8(ptrA+3, f23);
+    _fjsp_storel_v2r8(ptrA+4, f45);
+    _fjsp_storeh_v2r8(ptrA+5, f45);
+    _fjsp_storel_v2r8(ptrA+6, f67);
+    _fjsp_storeh_v2r8(ptrA+7, f67);
+    _fjsp_storel_v2r8(ptrA+8, f89);
+    _fjsp_storeh_v2r8(ptrA+9, f89);
+    _fjsp_storel_v2r8(ptrA+10,f1011);
+    _fjsp_storeh_v2r8(ptrA+11,f1011);
+}
+
+/* Subtract one rvec from ptrA[0..2] and one from ptrB[0..2] in place.
+ * The ptrA update is built with _fjsp_unpacklo_v2r8 of the inputs and the
+ * ptrB update with _fjsp_unpackhi_v2r8. Both pointers are read completely
+ * before anything is written; ptrA is written before ptrB.
+ *
+ * NOTE(review): presumably the low element of x1/y1/z1 belongs to ptrA and
+ * the high element to ptrB - confirm against the intrinsic documentation.
+ */
+static void
+gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                          _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
+{
+    _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+    _fjsp_v2r8 fAxy,fAz,fBxy,fBz;
+
+    fAxy = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA),        _fjsp_unpacklo_v2r8(x1,y1));
+    fAz  = _fjsp_sub_v2r8(_fjsp_loadl_v2r8(zero,ptrA+2),z1);
+    fBxy = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrB),        _fjsp_unpackhi_v2r8(x1,y1));
+    fBz  = _fjsp_sub_v2r8(_fjsp_loadl_v2r8(zero,ptrB+2),_fjsp_unpackhi_v2r8(z1,z1));
+
+    _fjsp_storel_v2r8(ptrA,  fAxy);
+    _fjsp_storeh_v2r8(ptrA+1,fAxy);
+    _fjsp_storel_v2r8(ptrA+2,fAz);
+    _fjsp_storel_v2r8(ptrB,  fBxy);
+    _fjsp_storeh_v2r8(ptrB+1,fBxy);
+    _fjsp_storel_v2r8(ptrB+2,fBz);
+}
+
+
+/* Update ptrA[0..2] and ptrB[0..2] in place with _fjsp_nmsub_v2r8, using
+ * fscal duplicated from its low half (for ptrA) or high half (for ptrB) as
+ * the first operand and the correspondingly unpacked dx1/dy1/dz1 as the
+ * second. Both pointers are read completely before anything is written;
+ * ptrA is written before ptrB.
+ *
+ * NOTE(review): presumably _fjsp_nmsub_v2r8(a,b,c) evaluates c - a*b, making
+ * this a fused "subtract fscal*d" - confirm against the Fujitsu intrinsic
+ * documentation.
+ */
+static void
+gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                              _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
+{
+    _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+    _fjsp_v2r8 fscalA,fscalB;
+    _fjsp_v2r8 fAxy,fAz,fBxy,fBz;
+
+    fscalA = _fjsp_unpacklo_v2r8(fscal,fscal);
+    fscalB = _fjsp_unpackhi_v2r8(fscal,fscal);
+
+    fAxy   = _fjsp_nmsub_v2r8(fscalA,_fjsp_unpacklo_v2r8(dx1,dy1),_fjsp_load_v2r8(ptrA));
+    fAz    = _fjsp_nmsub_v2r8(fscalA,dz1,_fjsp_loadl_v2r8(zero,ptrA+2));
+    fBxy   = _fjsp_nmsub_v2r8(fscalB,_fjsp_unpackhi_v2r8(dx1,dy1),_fjsp_load_v2r8(ptrB));
+    fBz    = _fjsp_nmsub_v2r8(fscalB,_fjsp_unpackhi_v2r8(dz1,dz1),_fjsp_loadl_v2r8(zero,ptrB+2));
+
+    _fjsp_storel_v2r8(ptrA,  fAxy);
+    _fjsp_storeh_v2r8(ptrA+1,fAxy);
+    _fjsp_storel_v2r8(ptrA+2,fAz);
+    _fjsp_storel_v2r8(ptrB,  fBxy);
+    _fjsp_storeh_v2r8(ptrB+1,fBxy);
+    _fjsp_storel_v2r8(ptrB+2,fBz);
+}
+
+
+/* Subtract three rvecs from ptrA[0..8] and three from ptrB[0..8] in place.
+ * The ptrA updates are built with _fjsp_unpacklo_v2r8 of the inputs and the
+ * ptrB updates with _fjsp_unpackhi_v2r8. Both pointers are read completely
+ * before anything is written; ptrA is written before ptrB.
+ *
+ * NOTE(review): presumably the low element of every input belongs to ptrA
+ * and the high element to ptrB - confirm against the intrinsic
+ * documentation.
+ */
+static void
+gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
+{
+    _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+    /* aN/bN is the updated data starting at ptrA[N]/ptrB[N] */
+    _fjsp_v2r8 a0,a2,a4,a6,a8;
+    _fjsp_v2r8 b0,b2,b4,b6,b8;
+
+    a0 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA),        _fjsp_unpacklo_v2r8(x1,y1));
+    a2 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+2),      _fjsp_unpacklo_v2r8(z1,x2));
+    a4 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+4),      _fjsp_unpacklo_v2r8(y2,z2));
+    a6 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+6),      _fjsp_unpacklo_v2r8(x3,y3));
+    a8 = _fjsp_sub_v2r8(_fjsp_loadl_v2r8(zero,ptrA+8),z3);
+
+    b0 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrB),        _fjsp_unpackhi_v2r8(x1,y1));
+    b2 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrB+2),      _fjsp_unpackhi_v2r8(z1,x2));
+    b4 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrB+4),      _fjsp_unpackhi_v2r8(y2,z2));
+    b6 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrB+6),      _fjsp_unpackhi_v2r8(x3,y3));
+    b8 = _fjsp_sub_v2r8(_fjsp_loadl_v2r8(zero,ptrB+8),_fjsp_unpackhi_v2r8(z3,z3));
+
+    /* Write back ptrA, one double at a time */
+    _fjsp_storel_v2r8(ptrA,  a0);
+    _fjsp_storeh_v2r8(ptrA+1,a0);
+    _fjsp_storel_v2r8(ptrA+2,a2);
+    _fjsp_storeh_v2r8(ptrA+3,a2);
+    _fjsp_storel_v2r8(ptrA+4,a4);
+    _fjsp_storeh_v2r8(ptrA+5,a4);
+    _fjsp_storel_v2r8(ptrA+6,a6);
+    _fjsp_storeh_v2r8(ptrA+7,a6);
+    _fjsp_storel_v2r8(ptrA+8,a8);
+
+    /* Write back ptrB, one double at a time */
+    _fjsp_storel_v2r8(ptrB,  b0);
+    _fjsp_storeh_v2r8(ptrB+1,b0);
+    _fjsp_storel_v2r8(ptrB+2,b2);
+    _fjsp_storeh_v2r8(ptrB+3,b2);
+    _fjsp_storel_v2r8(ptrB+4,b4);
+    _fjsp_storeh_v2r8(ptrB+5,b4);
+    _fjsp_storel_v2r8(ptrB+6,b6);
+    _fjsp_storeh_v2r8(ptrB+7,b6);
+    _fjsp_storel_v2r8(ptrB+8,b8);
+}
+
+
+/* Subtract four rvecs from ptrA[0..11] and four from ptrB[0..11] in place.
+ * The ptrA updates are built with _fjsp_unpacklo_v2r8 of the inputs and the
+ * ptrB updates with _fjsp_unpackhi_v2r8. Both pointers are read completely
+ * before anything is written; ptrA is written before ptrB.
+ *
+ * NOTE(review): presumably the low element of every input belongs to ptrA
+ * and the high element to ptrB - confirm against the intrinsic
+ * documentation.
+ */
+static void
+gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
+                                       _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
+{
+    /* aN/bN is the updated pair starting at ptrA[N]/ptrB[N] */
+    _fjsp_v2r8 a0,a2,a4,a6,a8,a10;
+    _fjsp_v2r8 b0,b2,b4,b6,b8,b10;
+
+    a0  = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA),   _fjsp_unpacklo_v2r8(x1,y1));
+    a2  = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+2), _fjsp_unpacklo_v2r8(z1,x2));
+    a4  = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+4), _fjsp_unpacklo_v2r8(y2,z2));
+    a6  = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+6), _fjsp_unpacklo_v2r8(x3,y3));
+    a8  = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+8), _fjsp_unpacklo_v2r8(z3,x4));
+    a10 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrA+10),_fjsp_unpacklo_v2r8(y4,z4));
+
+    b0  = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrB),   _fjsp_unpackhi_v2r8(x1,y1));
+    b2  = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrB+2), _fjsp_unpackhi_v2r8(z1,x2));
+    b4  = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrB+4), _fjsp_unpackhi_v2r8(y2,z2));
+    b6  = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrB+6), _fjsp_unpackhi_v2r8(x3,y3));
+    b8  = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrB+8), _fjsp_unpackhi_v2r8(z3,x4));
+    b10 = _fjsp_sub_v2r8(_fjsp_load_v2r8(ptrB+10),_fjsp_unpackhi_v2r8(y4,z4));
+
+    /* Write back ptrA, one double at a time */
+    _fjsp_storel_v2r8(ptrA,   a0);
+    _fjsp_storeh_v2r8(ptrA+1, a0);
+    _fjsp_storel_v2r8(ptrA+2, a2);
+    _fjsp_storeh_v2r8(ptrA+3, a2);
+    _fjsp_storel_v2r8(ptrA+4, a4);
+    _fjsp_storeh_v2r8(ptrA+5, a4);
+    _fjsp_storel_v2r8(ptrA+6, a6);
+    _fjsp_storeh_v2r8(ptrA+7, a6);
+    _fjsp_storel_v2r8(ptrA+8, a8);
+    _fjsp_storeh_v2r8(ptrA+9, a8);
+    _fjsp_storel_v2r8(ptrA+10,a10);
+    _fjsp_storeh_v2r8(ptrA+11,a10);
+
+    /* Write back ptrB, one double at a time */
+    _fjsp_storel_v2r8(ptrB,   b0);
+    _fjsp_storeh_v2r8(ptrB+1, b0);
+    _fjsp_storel_v2r8(ptrB+2, b2);
+    _fjsp_storeh_v2r8(ptrB+3, b2);
+    _fjsp_storel_v2r8(ptrB+4, b4);
+    _fjsp_storeh_v2r8(ptrB+5, b4);
+    _fjsp_storel_v2r8(ptrB+6, b6);
+    _fjsp_storeh_v2r8(ptrB+7, b6);
+    _fjsp_storel_v2r8(ptrB+8, b8);
+    _fjsp_storeh_v2r8(ptrB+9, b8);
+    _fjsp_storel_v2r8(ptrB+10,b10);
+    _fjsp_storeh_v2r8(ptrB+11,b10);
+}
+
+
+
+/* Reduce the force accumulators fix1/fiy1/fiz1 over their two elements and
+ * add the resulting three values both to fptr[0..2] and to fshiftptr[0..2].
+ *
+ * The temporaries use _fjsp_v2r8, the vector type every intrinsic in this
+ * header takes and returns, instead of the SSE2 type name __m128d.
+ *
+ * NOTE(review): the "reduce" description assumes _fjsp_unpacklo_v2r8 /
+ * _fjsp_unpackhi_v2r8 interleave the low / high elements of their operands
+ * like the SSE2 intrinsics of the same name - confirm.
+ */
+static gmx_inline void
+gmx_fjsp_update_iforce_1atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
+                                      double * gmx_restrict fptr,
+                                      double * gmx_restrict fshiftptr)
+{
+    _fjsp_v2r8 t1,t4;
+
+    /* transpose data */
+    t1 = fix1;
+    fix1 = _fjsp_unpacklo_v2r8(fix1,fiy1); /* y0 x0 */
+    fiy1 = _fjsp_unpackhi_v2r8(t1,fiy1);   /* y1 x1 */
+
+    /* x and y sums in fix1, z sum in the low element of fiz1 */
+    fix1 = _fjsp_add_v2r8(fix1,fiy1);
+    fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1,fiz1 ));
+
+    /* increment the force */
+    t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
+    _fjsp_storel_v2r8( fptr, t4 );
+    _fjsp_storeh_v2r8( fptr+1, t4 );
+    _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+2), fiz1 ));
+
+    /* increment the shift force with the same sums */
+    t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
+    _fjsp_storel_v2r8( fshiftptr, t4 );
+    _fjsp_storeh_v2r8( fshiftptr+1, t4 );
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+}
+
+/* Reduce the nine force accumulators fix1..fiz3 over their two elements,
+ * add the nine results to fptr[0..8], and add the per-component totals over
+ * the three atoms to fshiftptr[0..2].
+ *
+ * The temporaries use _fjsp_v2r8, the vector type every intrinsic in this
+ * header takes and returns, instead of the SSE2 type name __m128d.
+ *
+ * NOTE(review): the description assumes GMX_FJSP_TRANSPOSE2_V2R8,
+ * GMX_FJSP_SHUFFLE2 and the unpack intrinsics behave like their SSE2
+ * namesakes - confirm against the definitions earlier in this header.
+ */
+static gmx_inline void
+gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
+                                      _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
+                                      _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
+                                      double * gmx_restrict fptr,
+                                      double * gmx_restrict fshiftptr)
+{
+    _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
+
+    /* transpose data */
+    GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
+    t1 = fix3;
+    fix3 = _fjsp_unpacklo_v2r8(fix3,fiy3); /* y0 x0 */
+    fiy3 = _fjsp_unpackhi_v2r8(t1,fiy3);   /* y1 x1 */
+
+    /* add the two transposed halves of each pair */
+    fix1 = _fjsp_add_v2r8(fix1,fiy1);
+    fiz1 = _fjsp_add_v2r8(fiz1,fix2);
+    fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
+
+    fix3 = _fjsp_add_v2r8(fix3,fiy3);
+    fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3,fiz3));
+
+    /* increment the nine force components */
+    t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
+    t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
+    t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
+    t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
+
+    _fjsp_storel_v2r8( fptr,   t3 );
+    _fjsp_storeh_v2r8( fptr+1, t3 );
+    _fjsp_storel_v2r8( fptr+2, t4 );
+    _fjsp_storeh_v2r8( fptr+3, t4 );
+    _fjsp_storel_v2r8( fptr+4, t5 );
+    _fjsp_storeh_v2r8( fptr+5, t5 );
+    _fjsp_storel_v2r8( fptr+6, t6 );
+    _fjsp_storeh_v2r8( fptr+7, t6 );
+    _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+8), fiz3 ));
+
+    /* total over the three atoms for the shift force */
+    fix1 = _fjsp_add_v2r8(fix1,fix3);
+    t1   = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
+    fix1 = _fjsp_add_v2r8(fix1,t1); /* x and y sums */
+
+    t2   = _fjsp_shuffle_v2r8(fiy2,fiy2,GMX_FJSP_SHUFFLE2(1,1));
+    fiz1 = _fjsp_add_v2r8(fiz1,fiz3);
+    fiz1 = _fjsp_add_v2r8(fiz1,t2); /* z sum */
+
+    t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
+    _fjsp_storel_v2r8( fshiftptr, t3 );
+    _fjsp_storeh_v2r8( fshiftptr+1, t3 );
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+}
+
+
+/* Reduce the twelve force accumulators fix1..fiz4 over their two elements,
+ * add the twelve results to fptr[0..11], and add the per-component totals
+ * over the four atoms to fshiftptr[0..2].
+ *
+ * The temporaries use _fjsp_v2r8, the vector type every intrinsic in this
+ * header takes and returns, instead of the SSE2 type name __m128d.
+ *
+ * NOTE(review): the description assumes GMX_FJSP_TRANSPOSE2_V2R8,
+ * GMX_FJSP_SHUFFLE2 and the unpack intrinsics behave like their SSE2
+ * namesakes - confirm against the definitions earlier in this header.
+ */
+static gmx_inline void
+gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
+                                      _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
+                                      _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
+                                      _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
+                                      double * gmx_restrict fptr,
+                                      double * gmx_restrict fshiftptr)
+{
+    _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8;
+
+    /* transpose data */
+    GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fix3,fiy3);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz3,fix4);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy4,fiz4);
+
+    /* add the two transposed halves of each pair */
+    fix1 = _fjsp_add_v2r8(fix1,fiy1);
+    fiz1 = _fjsp_add_v2r8(fiz1,fix2);
+    fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
+    fix3 = _fjsp_add_v2r8(fix3,fiy3);
+    fiz3 = _fjsp_add_v2r8(fiz3,fix4);
+    fiy4 = _fjsp_add_v2r8(fiy4,fiz4);
+
+    /* increment the twelve force components */
+    t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr),    fix1 );
+    t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2),  fiz1 );
+    t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4),  fiy2 );
+    t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6),  fix3 );
+    t7 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+8),  fiz3 );
+    t8 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+10), fiy4 );
+    _fjsp_storel_v2r8( fptr,    t3 );
+    _fjsp_storeh_v2r8( fptr+1,  t3 );
+    _fjsp_storel_v2r8( fptr+2,  t4 );
+    _fjsp_storeh_v2r8( fptr+3,  t4 );
+    _fjsp_storel_v2r8( fptr+4,  t5 );
+    _fjsp_storeh_v2r8( fptr+5,  t5 );
+    _fjsp_storel_v2r8( fptr+6,  t6 );
+    _fjsp_storeh_v2r8( fptr+7,  t6 );
+    _fjsp_storel_v2r8( fptr+8,  t7 );
+    _fjsp_storeh_v2r8( fptr+9,  t7 );
+    _fjsp_storel_v2r8( fptr+10, t8 );
+    _fjsp_storeh_v2r8( fptr+11, t8 );
+
+    /* total over the four atoms for the shift force */
+    t1 = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
+    fix1 = _fjsp_add_v2r8(fix1,t1);
+    t2 = _fjsp_shuffle_v2r8(fiz3,fiy4,GMX_FJSP_SHUFFLE2(0,1));
+    fix3 = _fjsp_add_v2r8(fix3,t2);
+    fix1 = _fjsp_add_v2r8(fix1,fix3); /* x and y sums */
+
+    fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2,fiy2));
+    fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4,fiy4));
+    fiz1 = _fjsp_add_v2r8(fiz1,fiz3); /* z sum */
+
+    t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
+    _fjsp_storel_v2r8( fshiftptr, t3 );
+    _fjsp_storeh_v2r8( fshiftptr+1, t3 );
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+}
+
+
+
+/* Add pot1 and _fjsp_unpackhi_v2r8(pot1,pot1), add the double currently at
+ * *ptrA, and write the result back to *ptrA with _fjsp_storel_v2r8.
+ *
+ * NOTE(review): presumably this accumulates the sum of both elements of
+ * pot1 into *ptrA - confirm the unpackhi/storel element semantics.
+ */
+static gmx_inline void
+gmx_fjsp_update_1pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA)
+{
+    _fjsp_v2r8 upper,current;
+
+    upper   = _fjsp_unpackhi_v2r8(pot1,pot1);
+    pot1    = _fjsp_add_v2r8(pot1,upper);
+    current = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+    _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(pot1,current));
+}
+
+/* Transpose pot1/pot2, add the two transposed vectors, and accumulate the
+ * result into *ptrA (from the sum itself) and *ptrB (from its unpackhi).
+ * *ptrA is read and written before *ptrB is touched.
+ *
+ * NOTE(review): presumably *ptrA receives the sum of both elements of pot1
+ * and *ptrB that of pot2; this relies on GMX_FJSP_TRANSPOSE2_V2R8 being a
+ * 2x2 transpose - confirm against its definition earlier in this header.
+ */
+static gmx_inline void
+gmx_fjsp_update_2pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA,
+                      _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
+{
+    _fjsp_v2r8 sums,sums_hi,current;
+
+    GMX_FJSP_TRANSPOSE2_V2R8(pot1,pot2);
+    sums    = _fjsp_add_v2r8(pot1,pot2);
+    sums_hi = _fjsp_unpackhi_v2r8(sums,sums);
+
+    current = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+    _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(sums,current));
+
+    current = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB);
+    _fjsp_storel_v2r8(ptrB,_fjsp_add_v2r8(sums_hi,current));
+}
+
+
+#endif /* _kernelutil_sparc64_hpc_ace_double_h_ */
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py
new file mode 100755 (executable)
index 0000000..9b723bd
--- /dev/null
@@ -0,0 +1,538 @@
+#!/usr/bin/python
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org
+
+import sys
+import os
+sys.path.append ( "../preprocessor" )
+from gmxpreprocess import gmxpreprocess
+
+# "The happiest programs are programs that write other programs."
+#
+#
+# This script controls the generation of Gromacs nonbonded kernels.
+#
+# We no longer generate kernels on-the-fly, so this file is not run
+# during a Gromacs compile - only when we need to update the kernels (=rarely).
+#
+# To maximize performance, each combination of interactions in Gromacs
+# has a separate nonbonded kernel without conditionals in the code.
+# To avoid writing hundreds of different routines for each architecture,
+# we instead use a custom preprocessor so we can encode the conditionals
+# and expand for-loops (e.g, for water-water interactions)
+# from a general kernel template. While that file will contain quite a
+# few preprocessor directives, it is still an order of magnitude easier
+# to maintain than ~200 different kernels (not to mention it avoids bugs).
+#
+# To actually generate the kernels, this program iteratively calls the
+# preprocessor with different define settings corresponding to all
+# combinations of coulomb/van-der-Waals/geometry options.
+#
+# A main goal in the design was to make this new generator _general_. For
+# this reason we have used a lot of different fields to identify a particular
+# kernel and interaction. Basically, each kernel will have a name like
+#
+# nbkernel_ElecXX_VdwYY_GeomZZ_VF_QQ()
+#
+# Where XX/YY/ZZ/VF are strings to identify what the kernel computes.
+#
+# Elec/Vdw describe the type of interaction for electrostatics and van der Waals.
+# The geometry settings correspond e.g. to water-water or water-particle kernels,
+# and finally the VF setting is V,F,or VF depending on whether we calculate
+# only the potential, only the force, or both of them. The final string (QQ)
+# is the architecture/language/optimization of the kernel.
+#
+Arch       = 'sparc64_hpc_ace_double'
+
+# Explanation of the 'properties':
+#
+# It is cheap to compute r^2, and the kernels require various other functions of r for
+# different kinds of interaction. Depending on the needs of the kernel and the available
+# processor instructions, this will be done in different ways.
+#
+# 'rinv' means we need 1/r, which is calculated as 1/sqrt(r^2).
+# 'rinvsq' means we need 1/(r*r). This is calculated as rinv*rinv if we already did rinv, otherwise 1/r^2.
+# 'r' is similarly calculated as r^2*rinv when needed
+# 'table' means the interaction is tabulated, in which case we will calculate a table index before the interaction
+# 'shift' means the interaction will be modified by a constant to make it zero at the cutoff.
+# 'cutoff' means the interaction is set to 0.0 outside the cutoff
+#
+
+FileHeader = \
+'/*\n' \
+' * This file is part of the GROMACS molecular simulation package.\n' \
+' *\n' \
+' * Copyright (c) 2012, by the GROMACS development team, led by\n' \
+' * David van der Spoel, Berk Hess, Erik Lindahl, and including many\n' \
+' * others, as listed in the AUTHORS file in the top-level source\n' \
+' * directory and at http://www.gromacs.org.\n' \
+' *\n' \
+' * GROMACS is free software; you can redistribute it and/or\n' \
+' * modify it under the terms of the GNU Lesser General Public License\n' \
+' * as published by the Free Software Foundation; either version 2.1\n' \
+' * of the License, or (at your option) any later version.\n' \
+' *\n' \
+' * GROMACS is distributed in the hope that it will be useful,\n' \
+' * but WITHOUT ANY WARRANTY; without even the implied warranty of\n' \
+' * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n' \
+' * Lesser General Public License for more details.\n' \
+' *\n' \
+' * You should have received a copy of the GNU Lesser General Public\n' \
+' * License along with GROMACS; if not, see\n' \
+' * http://www.gnu.org/licenses, or write to the Free Software Foundation,\n' \
+' * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.\n' \
+' *\n' \
+' * If you want to redistribute modifications to GROMACS, please\n' \
+' * consider that scientific software is very special. Version\n' \
+' * control is crucial - bugs must be traceable. We will be happy to\n' \
+' * consider code for inclusion in the official distribution, but\n' \
+' * derived work must not be called official GROMACS. Details are found\n' \
+' * in the README & COPYING files - if they are missing, get the\n' \
+' * official version at http://www.gromacs.org.\n' \
+' *\n' \
+' * To help us fund GROMACS development, we humbly ask that you cite\n' \
+' * the research papers on the package. Check out http://www.gromacs.org.\n' \
+' */\n' \
+'/*\n' \
+' * Note: this file was generated by the GROMACS '+Arch+' kernel generator.\n' \
+' */\n'
+
+###############################################
+# ELECTROSTATICS
+# Interactions and flags for them
+###############################################
+ElectrostaticsList = {
+    'None'                    : [],
+    'Coulomb'                 : ['rinv','rinvsq'],
+    'ReactionField'           : ['rinv','rinvsq'],
+    'GeneralizedBorn'         : ['rinv','r'],
+    'CubicSplineTable'        : ['rinv','r','table'],
+    'Ewald'                   : ['rinv','rinvsq','r'],
+}
+
+
+###############################################
+# VAN DER WAALS
+# Interactions and flags for them
+###############################################
+# Maps each van der Waals interaction name to the list of distance
+# 'properties' (see the explanation of the properties above) it requires.
+VdwList = {
+    'None'                    : [],
+    'LennardJones'            : ['rinvsq'],
+#    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for sse4.1 to reduce number of kernels and simplify the template
+    'CubicSplineTable'        : ['rinv','r','table'],
+}
+
+
+###############################################
+# MODIFIERS
+# Different ways to adjust/modify interactions to conserve energy
+###############################################
+ModifierList = {
+    'None'                    : [],
+    'ExactCutoff'             : ['exactcutoff'],        # Zero the interaction outside the cutoff, used for reaction-field-zero
+    'PotentialShift'          : ['shift','exactcutoff'],
+    'PotentialSwitch'         : ['rinv','r','switch','exactcutoff']
+}
+
+
+###############################################
+# GEOMETRY COMBINATIONS
+###############################################
+GeometryNameList = [
+    [ 'Particle' , 'Particle' ],
+    [ 'Water3'   , 'Particle' ],
+    [ 'Water3'   , 'Water3'   ],
+    [ 'Water4'   , 'Particle' ],
+    [ 'Water4'   , 'Water4'   ]
+]
+
+
+###############################################
+# POTENTIAL / FORCE
+###############################################
+VFList = [
+    'PotentialAndForce',
+# 'Potential',   # Not used yet
+    'Force'
+]
+
+
+###############################################
+# GEOMETRY PROPERTIES
+###############################################
+# Dictionaries with lists telling which interactions are present
+# 1,2,3 means particles 1,2,3 (but not 0) have electrostatics!
+# Geometry name -> list of site indices (not 0/1 flags) that take part in
+# electrostatics. SetDefines() pairs these indices up for the i and j groups.
+GeometryElectrostatics = {
+    'Particle'  : [ 0 ],
+    'Particle2' : [ 0 , 1 ],
+    'Particle3' : [ 0 , 1 , 2 ],
+    'Particle4' : [ 0 , 1 , 2 , 3 ],
+    'Water3'    : [ 0 , 1 , 2 ],
+    'Water4'    : [ 1 , 2 , 3 ]    # site 0 is left out: no electrostatics on it
+}
+
+# Geometry name -> list of site indices (not 0/1 flags) that take part in
+# van der Waals interactions.
+GeometryVdw = {
+    'Particle'  : [ 0 ],
+    'Particle2' : [ 0 , 1 ],
+    'Particle3' : [ 0 , 1 , 2 ],
+    'Particle4' : [ 0 , 1 , 2 , 3 ],
+    'Water3'    : [ 0 ],    # only site 0 has Vdw in the water geometries
+    'Water4'    : [ 0 ]
+}
+
+
+
+
+# Dictionary to abbreviate all strings (mixed from all the lists)
+Abbreviation = {
+    'None'                    : 'None',
+    'Coulomb'                 : 'Coul',
+    'Ewald'                   : 'Ew',
+    'ReactionField'           : 'RF',
+    'GeneralizedBorn'         : 'GB',
+    'CubicSplineTable'        : 'CSTab',
+    'LennardJones'            : 'LJ',
+    'Buckingham'              : 'Bham',
+    'PotentialShift'          : 'Sh',
+    'PotentialSwitch'         : 'Sw',
+    'ExactCutoff'             : 'Cut',
+    'PotentialAndForce'       : 'VF',
+    'Potential'               : 'V',
+    'Force'                   : 'F',
+    'Water3'                  : 'W3',
+    'Water4'                  : 'W4',
+    'Particle'                : 'P1',
+    'Particle2'               : 'P2',
+    'Particle3'               : 'P3',
+    'Particle4'               : 'P4'
+}
+
+
+###############################################
+# Functions
+###############################################
+
+# Return a string with the kernel name from current settings
+def MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom):
+    ElecStr = 'Elec' + Abbreviation[KernelElec]
+    if(KernelElecMod!='None'):
+        ElecStr = ElecStr + Abbreviation[KernelElecMod]
+    VdwStr  = 'Vdw'  + Abbreviation[KernelVdw]
+    if(KernelVdwMod!='None'):
+        VdwStr = VdwStr + Abbreviation[KernelVdwMod]
+    GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + Abbreviation[KernelGeom[1]]
+    return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + Arch
+
+def MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+    ElecStr = 'Elec' + Abbreviation[KernelElec]
+    if(KernelElecMod!='None'):
+        ElecStr = ElecStr + Abbreviation[KernelElecMod]
+    VdwStr  = 'Vdw'  + Abbreviation[KernelVdw]
+    if(KernelVdwMod!='None'):
+        VdwStr = VdwStr + Abbreviation[KernelVdwMod]
+    GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + Abbreviation[KernelGeom[1]]
+    VFStr   = Abbreviation[KernelVF]
+    return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + VFStr + '_' + Arch
+
+# Return a string with a declaration to use for the kernel;
+# this will be a sequence of string combinations as well as the actual function name
+# Don't worry about field widths - that is just pretty-printing for the header!
+def MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF):
+    KernelStr   = '\"'+KernelName+'\"'
+    ArchStr     = '\"'+Arch+'\"'
+    ElecStr     = '\"'+KernelElec+'\"'
+    ElecModStr  = '\"'+KernelElecMod+'\"'
+    VdwStr      = '\"'+KernelVdw+'\"'
+    VdwModStr   = '\"'+KernelVdwMod+'\"'
+    GeomStr     = '\"'+KernelGeom[0]+KernelGeom[1]+'\"'
+    OtherStr    = '\"'+KernelOther+'\"'
+    VFStr       = '\"'+KernelVF+'\"'
+
+    ThisSpec = ArchStr+', '+ElecStr+', '+ElecModStr+', '+VdwStr+', '+VdwModStr+', '+GeomStr+', '+OtherStr+', '+VFStr
+    ThisDecl = '    { '+KernelName+', '+KernelStr+', '+ThisSpec+' }'
+    return ThisDecl
+
+
+# Returns 1 if this kernel should be created, 0 if we should skip it
+# This routine is not critical - it is not the end of the world if we create more kernels,
+# but since the number is pretty large we save both space and compile-time by reducing it a bit.
+def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+
+    # No need for kernels without interactions
+    if(KernelElec=='None' and KernelVdw=='None'):
+        return 0
+
+    # No need for modifiers without interactions
+    if((KernelElec=='None' and KernelElecMod!='None') or (KernelVdw=='None' and KernelVdwMod!='None')):
+        return 0
+
+    # No need for LJ-only water optimization, or water optimization with implicit solvent.
+    if('Water' in KernelGeom[0] and (KernelElec=='None' or 'GeneralizedBorn' in KernelElec)):
+        return 0
+
+    # Non-matching table settings are pointless
+    if( ('Table' in KernelElec) and ('Table' in KernelVdw) and KernelElec!=KernelVdw ):
+        return 0
+
+    # Try to reduce the number of different switch/shift options to get a reasonable number of kernels
+    # For electrostatics, reaction-field can use 'exactcutoff', and ewald can use switch or shift.
+    if(KernelElecMod=='ExactCutoff' and KernelElec!='ReactionField'):
+        return 0
+    if(KernelElecMod in ['PotentialShift','PotentialSwitch'] and KernelElec!='Ewald'):
+        return 0
+    # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
+    if((KernelVdwMod=='ExactCutoff') or
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+        return 0
+
+    # Choose either switch or shift and don't mix them...
+    if((KernelElecMod=='PotentialShift' and KernelVdwMod=='PotentialSwitch') or
+       (KernelElecMod=='PotentialSwitch' and KernelVdwMod=='PotentialShift')):
+        return 0
+
+    # Don't use a Vdw kernel with a modifier if the electrostatics one does not have one
+    if(KernelElec!='None' and KernelElecMod=='None' and KernelVdwMod!='None'):
+        return 0
+
+    # Don't use an electrostatics kernel with a modifier if the vdw one does not have one,
+    # unless the electrostatics one is reaction-field with exact cutoff.
+    if(KernelVdw!='None' and KernelVdwMod=='None' and KernelElecMod!='None'):
+        if(KernelElec=='ReactionField' and KernelVdw!='CubicSplineTable'):
+            return 0
+        elif(KernelElec!='ReactionField'):
+            return 0
+
+    return 1
+
+
+
+#
+# The preprocessor will automatically expand the interactions for water and other
+# geometries inside the kernel, but to get this right we need to setup a couple
+# of defines - we do them in a separate routine to keep the main loop clean.
+#
+# While this routine might look a bit complex it is actually quite straightforward,
+# and the best news is that you won't have to modify _anything_ for a new geometry
+# as long as you correctly define its Electrostatics/Vdw geometry in the lists above!
+#
+def SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines):
+    """Fill `defines` with the per-site and per-pair settings for one kernel.
+
+    KernelGeom is an [i-geometry, j-geometry] pair of names that are looked
+    up in GeometryElectrostatics and GeometryVdw. The following keys are
+    written into the `defines` dictionary (modified in place, nothing is
+    returned):
+
+      GEOMETRY_I, GEOMETRY_J              geometry names
+      PARTICLES_ELEC_I, PARTICLES_ELEC_J  site indices with electrostatics
+      PARTICLES_VDW_I,  PARTICLES_VDW_J   site indices with Vdw
+      PARTICLES_I,      PARTICLES_J       site indices with any interaction
+      PAIRS_IJ                            [i,j] pairs with any interaction
+      INTERACTION_FLAGS                   properties[i][j] list of strings
+
+    KernelVF is accepted but not used here. The function calls max() on the
+    lists of used sites, so it raises ValueError if no site has any
+    interaction (KernelElec and KernelVdw both 'None').
+    """
+    # What is the _name_ for the i/j group geometry?
+    igeometry            = KernelGeom[0]
+    jgeometry            = KernelGeom[1]
+    # define so we can access it in the source when the preprocessor runs
+    defines['GEOMETRY_I'] = igeometry
+    defines['GEOMETRY_J'] = jgeometry
+
+    # For the i/j groups, extract a python list with the indices of the sites that have electrostatics
+    # For 'Water3' this is [0,1,2], while for 'Water4' (no elec on first site) it is [1,2,3]
+    ielec                = GeometryElectrostatics[igeometry]
+    jelec                = GeometryElectrostatics[jgeometry]
+    # Replace them with empty lists in case we don't do Elec
+    if(KernelElec=='None'):
+        ielec = []
+        jelec = []
+
+    # Extract similar site index lists for Vdw interactions (for 'Water3': [0])
+    iVdw                 = GeometryVdw[igeometry]
+    jVdw                 = GeometryVdw[jgeometry]
+
+    # Replace them with empty lists in case we don't do Vdw
+    if(KernelVdw=='None'):
+        iVdw = []
+        jVdw = []
+
+    # iany[] and jany[] contain lists of the particles actually used (for interactions) in this kernel
+    iany = list(set(ielec+iVdw))  # convert to+from set to make elements unique
+    jany = list(set(jelec+jVdw))
+
+    defines['PARTICLES_ELEC_I'] = ielec
+    defines['PARTICLES_ELEC_J'] = jelec
+    defines['PARTICLES_VDW_I']  = iVdw
+    defines['PARTICLES_VDW_J']  = jVdw
+    defines['PARTICLES_I']      = iany
+    defines['PARTICLES_J']      = jany
+
+    # elecij,Vdwij are lists with pairs of particles for which the corresponding interaction is done
+    # (and anyij again corresponds to either electrostatics or Vdw)
+    elecij = []
+    Vdwij  = []
+    anyij  = []
+
+    for i in ielec:
+        for j in jelec:
+            elecij.append([i,j])
+
+    for i in iVdw:
+        for j in jVdw:
+            Vdwij.append([i,j])
+
+    for i in iany:
+        for j in jany:
+            if [i,j] in elecij or [i,j] in Vdwij:
+                anyij.append([i,j])
+
+    defines['PAIRS_IJ']     = anyij
+
+    # Make a 2d list-of-distance-properties-to-calculate for i,j
+    # (sized by the largest used site index, so unused sites get empty lists)
+    ni = max(iany)+1
+    nj = max(jany)+1
+    # Each element properties[i][j] is an empty list
+    properties = [ [ [] for j in range(0,nj) ] for i in range (0,ni) ]
+    # Add properties to each set
+    for i in range(0,ni):
+        for j in range(0,nj):
+            if [i,j] in elecij:
+                properties[i][j] = properties[i][j] + ['electrostatics'] + ElectrostaticsList[KernelElec] + ModifierList[KernelElecMod]
+            if [i,j] in Vdwij:
+                properties[i][j] = properties[i][j] + ['vdw'] + VdwList[KernelVdw] + ModifierList[KernelVdwMod]
+            # Add rinv if we need r (appended even if 'rinv' is already present)
+            if 'r' in properties[i][j]:
+                properties[i][j] = properties[i][j] + ['rinv']
+            # Add rsq if we need rinv or rinvsq
+            if 'rinv' in properties[i][j] or 'rinvsq' in properties[i][j]:
+                properties[i][j] = properties[i][j] + ['rsq']
+
+    defines['INTERACTION_FLAGS']    = properties
+
+
+
+def PrintStatistics(ratio):
+    ratio = 100.0*ratio
+    print '\rGenerating %s nonbonded kernels... %5.1f%%' % (Arch,ratio),
+    sys.stdout.flush()
+
+
+
+# Script state: 'defines' is the dictionary handed to the preprocessor for
+# every kernel, 'kerneldecl' collects one initializer line per generated kernel.
+defines = {}
+kerneldecl = []
+
+# cnt is a float so that cnt/ntot below is a fractional progress ratio
+cnt     = 0.0
+nelec   = len(ElectrostaticsList)
+nVdw    = len(VdwList)
+nmod    = len(ModifierList)
+ngeom   = len(GeometryNameList)
+
+# Number of (elec, elec modifier, vdw, vdw modifier, geometry) combinations
+# visited by the main loop, including those that are skipped
+ntot    = nelec*nmod*nVdw*nmod*ngeom
+
+numKernels = 0
+
+# Declaration file: note that it is written with a '.c' extension although
+# it starts with an include guard whose macro name ends in '_h'
+fpdecl = open('nb_kernel_' + Arch + '.c','w')
+fpdecl.write( FileHeader )
+fpdecl.write( '#ifndef nb_kernel_' + Arch + '_h\n' )
+fpdecl.write( '#define nb_kernel_' + Arch + '_h\n\n' )
+fpdecl.write( '#include "../nb_kernel.h"\n\n' )
+
+# Main generation loop: visit every combination of electrostatics, its
+# modifier, Vdw, its modifier and geometry. Each combination gets one .c
+# file that holds a kernel for every entry of VFList that KeepKernel accepts.
+for KernelElec in ElectrostaticsList:
+    defines['KERNEL_ELEC'] = KernelElec
+
+    for KernelElecMod in ModifierList:
+        defines['KERNEL_MOD_ELEC'] = KernelElecMod
+
+        for KernelVdw in VdwList:
+            defines['KERNEL_VDW'] = KernelVdw
+
+            for KernelVdwMod in ModifierList:
+                defines['KERNEL_MOD_VDW'] = KernelVdwMod
+
+                for KernelGeom in GeometryNameList:
+
+                    cnt += 1
+                    # The output file is opened before we know whether any kernel
+                    # will be kept; files left empty are removed again below.
+                    KernelFilename = MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom) + '.c'
+                    fpkernel = open(KernelFilename,'w')
+                    defines['INCLUDE_HEADER'] = 1  # Include header first time in new file
+                    DoHeader = 1
+
+                    for KernelVF in VFList:
+
+                        KernelName = MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF)
+
+                        defines['KERNEL_NAME'] = KernelName
+                        defines['KERNEL_VF']   = KernelVF
+
+                        # Check if this is a valid/sane/usable combination
+                        if not KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+                            continue;
+
+                        # The overall kernel settings determine what the _kernel_ calculates, but for the water
+                        # kernels this does not mean that every pairwise interaction has e.g. Vdw interactions.
+                        # This routine sets defines of what to calculate for each pair of particles in those cases.
+                        SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines)
+
+                        # The license/generator banner goes in once, before the first kernel of the file
+                        if(DoHeader==1):
+                            fpkernel.write( FileHeader )
+
+                        # Expand the template into a temporary file named after the kernel
+                        gmxpreprocess('nb_kernel_template_' + Arch + '.pre', KernelName+'.tmp' , defines, force=1,contentType='C')
+                        numKernels = numKernels + 1
+
+                        defines['INCLUDE_HEADER'] = 0   # Header has been included once now
+                        DoHeader=0
+
+                        # Append temp file contents to the common kernelfile
+                        fptmp = open(KernelName+'.tmp','r')
+                        fpkernel.writelines(fptmp.readlines())
+                        fptmp.close()
+                        os.remove(KernelName+'.tmp')
+
+                        # Add a declaration for this kernel
+                        fpdecl.write('nb_kernel_t ' + KernelName + ';\n');
+
+                        # Add declaration to the buffer
+                        KernelOther=''
+                        kerneldecl.append(MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF))
+
+                    # Nothing was written if every VF variant was rejected: drop the empty file
+                    filesize = fpkernel.tell()
+                    fpkernel.close()
+                    if(filesize==0):
+                        os.remove(KernelFilename)
+
+                    PrintStatistics(cnt/ntot)
+                pass
+            pass
+        pass
+    pass
+pass
+
+# Write out the list of settings and corresponding kernels to the declaration file
+# as a C array 'kernellist_<Arch>[]' followed by 'kernellist_<Arch>_size'.
+# All entries but the last get a trailing comma. Indexing kerneldecl[-1]
+# assumes at least one kernel was generated (IndexError otherwise).
+fpdecl.write( '\n\n' )
+fpdecl.write( 'nb_kernel_info_t\n' )
+fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( '{\n' )
+for decl in kerneldecl[0:-1]:
+    fpdecl.write( decl + ',\n' )
+fpdecl.write( kerneldecl[-1] + '\n' )
+fpdecl.write( '};\n\n' )
+fpdecl.write( 'int\n' )
+fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+# Close the include guard opened at the top of the declaration file
+fpdecl.write( '#endif\n')
+fpdecl.close()
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..4b3773d
--- /dev/null
@@ -0,0 +1,711 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 76 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 76 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*76);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force (this variant accumulates no potential energies)
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters; this particle-particle kernel only uses suffix 0.
+     * Suffixes A,B refer to the j loop unrolling: the two jnr indices (jnrA, jnrB) that are
+     * processed together in one iteration, each occupying one of the two slots of a SIMD value.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+    /* NOTE(review): several locals above (e.g. tx, rcutoff, one_sixth, gbconv, ewconv, ggid) are never referenced in this function */
+    x                = xx[0];
+    f                = ff[0];
+    /* Neighborlist arrays, shift vectors and per-atom parameters read by the loops below */
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    /* Combined electrostatics+VdW table and its scale factor (r times the scale, truncated, is the table index) */
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Pre-set the j indices and offsets; present only to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    /* Iteration counters; used only for the flop accounting at the end of the kernel */
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i particle entry (nri in total) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of the shift vector for this list, used with both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        /* Reset the force accumulators of the i particle */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge scaled by facel (fr->epsfac) and the offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms: charge product and the c6/c12 pair of each j */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Each table point spans 12 reals: electrostatics at +0, dispersion at +4, repulsion at +8 (offsets used below) */
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (assuming madd(a,b,c) is a*b+c: Fp=F+eps*(G+eps*H), FF=Fp+eps*(G+2*eps*H)) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION (entry starts 4 reals after the electrostatics entry; scaled by c6) */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (entry starts 4 reals after the dispersion entry; scaled by c12) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            /* Pass the same fscal and displacement to the helper that updates the two j particle forces in f */
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 64 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Odd number of neighbors: handle the last j particle alone, using only jnrA */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Only vfconv.i[0] is used for the table lookups in this single-j branch */
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (second operand of each transpose is set to zero instead of loaded) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION (entry starts 4 reals after the electrostatics entry; scaled by c6) */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (entry starts 4 reals after the dispersion entry; scaled by c12) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+            /* NOTE(review): presumably keeps slot 0 of fscal and zeroes slot 1 so the unused slot adds no force - confirm _fjsp_unpacklo_v2r8 semantics */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            /* Pass the same fscal and displacement to the helper that updates the single j particle force in f */
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 64 flops */
+        }
+
+        /* End of innermost loop */
+        /* Hand the accumulated i force to the helper together with this particle's f and fshift locations */
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations by the neighbor count, including any odd leftover */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 7 per outer iteration plus 64 per j particle */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..66d4fc9
--- /dev/null
@@ -0,0 +1,1173 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 171 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 171 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*171);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 151 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 151 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*151);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..db19a17
--- /dev/null
@@ -0,0 +1,2311 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. to the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 444 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 444 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*444);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 400 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 400 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*400);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..70be5c8
--- /dev/null
@@ -0,0 +1,1329 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 200 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 200 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*200);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is placed in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 180 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 180 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*180);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7ef236b
--- /dev/null
@@ -0,0 +1,2479 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 476 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 476 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*476);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 432 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 432 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*432);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c78c355
--- /dev/null
@@ -0,0 +1,635 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable (table data and scale are read from kernel_data->table_elec)
+ * VdW interaction:            LennardJones (c6/c12 pairs are read from fr->nbfp, indexed via mdatoms->typeA)
+ * Geometry:                   Particle-Particle (i/j pairs come from nlist; xx holds coordinates, ff is handed to the force-update helpers)
+ * Calculate force/pot:        PotentialAndForce (energies go to kernel_data->energygrp_elec/energygrp_vdw, flop count to nrnb)
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions of a _fjsp_v2r8 SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Initialize the j indices/offsets up front to avoid compiler warnings; they are reassigned inside the j loops */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: each list has one i particle (iinr[iidx]) and a range of j neighbors */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; used below for both shiftvec and fshift (DIM reals per entry) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this list uses jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer particle index and its offset into the coordinate/force arrays */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge pre-multiplied by fr->epsfac, and the i-type offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; a leftover odd particle is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: 4 reals per table point (hence the *4 above), loaded as Y,F and G,H pairs */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (NOTE(review): one_sixth/one_twelfth suggest c6/c12 are stored pre-scaled by 6/12 - confirm) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 59 flops (the per-j-particle count passed to inc_nrnb at the end) */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single leftover j particle: only the A index/offset is used in this branch) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles (jnrA only; loadl on a zeroed register presumably leaves the second element 0) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: only table point i[0] is read; zeros stand in for the missing second particle */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sums; unpacklo with zero presumably drops the unused second SIMD element first (TODO confirm) */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies at the group index gid[iidx] of this list */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted in j particles, not in SIMD iterations) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*59);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable (table data and scale are read from kernel_data->table_elec)
+ * VdW interaction:            LennardJones (c6/c12 pairs are read from fr->nbfp, indexed via mdatoms->typeA)
+ * Geometry:                   Particle-Particle (i/j pairs come from nlist; xx holds coordinates, ff is handed to the force-update helpers)
+ * Calculate force/pot:        Force (no potential sums are accumulated here; only the flop count is reported to nrnb)
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions of a _fjsp_v2r8 SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Initialize the j indices/offsets up front to avoid compiler warnings; they are reassigned inside the j loops */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: each list has one i particle (iinr[iidx]) and a range of j neighbors */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; used below for both shiftvec and fshift (DIM reals per entry) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this list uses jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer particle index and its offset into the coordinate/force arrays */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge pre-multiplied by fr->epsfac, and the i-type offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; a leftover odd particle is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: 4 reals per table point (hence the *4 above); only the force term FF is formed */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (force only; no potential terms are computed in this kernel) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 50 flops (the per-j-particle count passed to inc_nrnb at the end) */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single leftover j particle: only the A index/offset is used in this branch) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles (jnrA only; loadl on a zeroed register presumably leaves the second element 0) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: only table point i[0] is read; zeros stand in for the missing second particle */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (force only; no potential terms are computed in this kernel) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force (the unpacklo with zero above presumably drops the unused second SIMD element - TODO confirm) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 50 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted in j particles, not in SIMD iterations) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*50);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..602513b
--- /dev/null
@@ -0,0 +1,1097 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For each i entry of the neighbor list, three i atoms (suffixes 0, 1, 2) interact
+ * with every j particle listed in jjnr[jindex[iidx] .. jindex[iidx+1]-1].
+ * All three i atoms get tabulated electrostatics; only i atom 0 gets the
+ * Lennard-Jones term. j particles are processed two at a time, followed by a
+ * single-particle epilogue when one is left over.
+ *
+ * nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+ * xx           coordinates, accessed as a flat real array starting at xx[0]
+ * ff           forces, accessed as a flat real array starting at ff[0] and handed
+ *              to the force update helpers
+ * fr           supplies shift_vec, fshift, epsfac, ntype and nbfp
+ * mdatoms      supplies chargeA and typeA
+ * kernel_data  supplies table_elec (data and scale); energygrp_elec and
+ *              energygrp_vdw are handed to the potential update helper
+ * nrnb         flop accounting, incremented through inc_nrnb() under
+ *              eNR_NBKERNEL_ELEC_VDW_W3_VF
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. to the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * NOTE(review): the generator declares more locals than this kernel references
+     * (e.g. tx, rcutoff, vdwioffset1, krf, one, two, gbconv and ewconv are never used below).
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv exposes the two table indices as integers */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters.
+     * The scaled charges and the VdW parameter offset are taken from the first i
+     * entry only and reused for all i entries (presumably every i water in this
+     * list shares the same parameters -- confirm against the list builder).
+     * Note that iinr[0] is read before the nri loop bound is checked.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Give these a defined value up front; this only serves to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00); /* only used by the 0-0 Lennard-Jones term */
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 0 - j atom 0:   *
+             * table elec. plus LJ    *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps is rt minus the converted-back index, i.e. the offset within the table interval.
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4; /* the loads below read 4 reals (Y,F,G,H) per table point */
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Each load fetches two consecutive table reals for one j lane; the
+             * transposes presumably regroup them per coefficient (Y,F then G,H).
+             * Assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c:
+             *   Fp = F + eps*(G + eps*H)
+             *   VV = Y + eps*Fp
+             *   FF = Fp + eps*(G + 2*eps*H)
+             * felec is -qq*FF*tabscale*rinv.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * Assuming _fjsp_msub_v2r8(a,b,c) computes a*b-c: vvdw = vvdw12/12 - vvdw6/6,
+             * while fvdw = (vvdw12 - vvdw6)*rinvsq.
+             * NOTE(review): the 1/12 and 1/6 factors suggest c12/c6 are stored
+             * pre-multiplied by 12 and 6 -- confirm against how fr->nbfp is filled.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force: the same dx*fscal product is accumulated for i and for j */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 1 - j atom 0:   *
+             * table elec. only       *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same evaluation as for the 0-0 pair) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 2 - j atom 0:   *
+             * table elec. only       *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same evaluation as for the 0-0 pair) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* hand the summed j contributions for A and B to the force array helper */
+
+            /* Inner loop uses 154 flops */
+        }
+
+        if(jidx<j_index_end) /* one j particle is left over after the pairwise loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* only the charge of A is loaded; the rest comes from the zero vector (presumably lane B = 0) */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 0 - j atom 0:   *
+             * table elec. plus LJ    *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4; /* scaled like in the pairwise loop, but i[1] is not used for any load below */
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Same evaluation as in the pairwise loop, except that only the table
+             * entry for i[0] is loaded; a zero vector takes the place of the second load.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same formulas as in the pairwise loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * The unpacklo with a zero vector presumably clears the lane that has no
+             * j particle before the value is accumulated -- confirm intrinsic semantics.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 1 - j atom 0:   *
+             * table elec. only       *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (single table entry, as for the 0-0 pair above) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 2 - j atom 0:   *
+             * table elec. only       *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (single table entry, as for the 0-0 pair above) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: the sums go to the slot selected by this i entry's gid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations.
+         * This counts every j particle of the entry, including one handled by the
+         * single-particle epilogue.
+         */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration estimates quoted in the
+     * loop comments above (20 per outer and 154 per inner iteration).
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*154);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 137 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 137 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*137);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..18e4b9a
--- /dev/null
@@ -0,0 +1,2235 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 427 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 427 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*427);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 386 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 386 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*386);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..635418c
--- /dev/null
@@ -0,0 +1,1201 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i entry of the neighbor list, four i sites are loaded. Site 0 interacts
+ * with each j particle through Lennard-Jones only (interaction 00); sites 1, 2 and 3
+ * interact through tabulated electrostatics only (interactions 10, 20, 30).
+ * j particles are processed two at a time (suffixes A and B), with a separate
+ * single-particle pass when the j list of an i entry has odd length.
+ *
+ * nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+ * xx           coordinates, accessed as a flat real array through xx[0].
+ * ff           forces, accessed as a flat real array through ff[0]; passed to the
+ *              i-force update and j-force decrement helpers.
+ * fr           supplies shift_vec, fshift, epsfac, ntype and nbfp; fshift is passed
+ *              to the i-force update helper.
+ * mdatoms      supplies chargeA and typeA.
+ * kernel_data  supplies table_elec (data and scale); energygrp_elec and energygrp_vdw
+ *              are passed, offset by gid[iidx], to the potential update helper.
+ * nrnb         flop counters, incremented once at the end through inc_nrnb().
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Several of the locals declared below are never referenced in this kernel variant.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters.
+     * The charges of sites 1-3 (multiplied by facel) and the LJ parameter offset of
+     * site 0 are taken from the FIRST i entry only and reused for every i entry of
+     * the outer loop.
+     * NOTE(review): iinr[0] is read even when nri == 0; presumably callers never pass
+     * an empty list - confirm.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Pre-initialize the j indices and offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * j particles are taken in pairs (jidx, jidx+1); the loop bound j_index_end-1
+         * leaves a last unpaired j entry, if any, to the block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it.
+             * Only the quantities each interaction needs are computed: rinv for the
+             * table electrostatics of sites 1-3, rinvsq for the LJ interaction of site 0.
+             */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * rinvsix = rinvsq00^3, vvdw6 = c6_00*rinvsix, vvdw12 = c12_00*rinvsix^2.
+             * Assuming _fjsp_msub_v2r8(a,b,c) computes a*b-c (TODO confirm):
+             *   vvdw = vvdw12/12 - vvdw6/6
+             *   fvdw = (vvdw12 - vvdw6)*rinvsq00
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: the same d*fscal term is accumulated for the i site and for j */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps is rt minus the integer index converted back to double. The integer
+             * indices are stored to the vfconv union and multiplied by 4, matching the
+             * loads below that read two reals at offset +0 and two at offset +2 per index.
+             */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Each load reads table data for one j particle (index i[0] or i[1]); the
+             * transposes presumably regroup them into Y,F and G,H with one SIMD element
+             * per j particle (macro not visible here - confirm).
+             * Assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c (TODO confirm):
+             *   Fp    = F + vfeps*(G + vfeps*H)
+             *   VV    = Y + vfeps*Fp
+             *   velec = qq10*VV
+             *   FF    = Fp + vfeps*(G + 2*vfeps*H)
+             *   felec = -(qq10*FF)*(vftabscale*rinv10)
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for interaction 10, using qq20 and rinv20) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for interaction 10, using qq30 and rinv30) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 176 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates.
+             * This block handles the single j entry left over when the j list has odd
+             * length. Only one j particle (A) is loaded; potentials and scalar forces
+             * are passed through _fjsp_unpacklo_v2r8(v,zero) before accumulation,
+             * presumably to clear the unused second SIMD element (TODO confirm).
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles (charge of particle A only, combined with a zero vector) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the paired loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * Both indices are scaled, but only vfconv.i[0] is used for table loads below.
+             */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Table data is loaded for index i[0] only; the partner vectors of the
+             * transposes (F and H before transposing) are set to zero instead of
+             * being loaded for a second j particle.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 176 flops */
+        }
+
+        /* End of innermost loop.
+         * The accumulated forces of the four i sites are handed to the update helper
+         * together with the force and shift-force locations of this i entry.
+         */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies in the energy group slots selected by gid[iidx] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (one per j entry, including an unpaired last one) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 26 per i entry plus 176 per j entry */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*176);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel for lists whose i entries are 4-site waters (sites addressed
+ * as inr+0 .. inr+3) and whose j entries are single particles.
+ * In the code below, i site 0 interacts with each j particle through
+ * Lennard-Jones only, while i sites 1-3 interact through tabulated
+ * electrostatics only. No energies are accumulated or written here.
+ *
+ * Parameters, as used by this function:
+ *   nlist        neighbor list; nri, iinr, jindex, jjnr and shift are read
+ *                (gid is fetched into a local but not used in this kernel).
+ *   xx           coordinates, accessed as a flat real array through xx[0].
+ *   ff           forces, accessed as a flat real array through ff[0]; updated
+ *                for i and j atoms through the gmx_fjsp_* helper routines.
+ *   fr           supplies shift_vec, fshift, epsfac, ntype and nbfp.
+ *   mdatoms      supplies chargeA and typeA.
+ *   kernel_data  supplies table_elec (data and scale).
+ *   nrnb         flop accounting, updated once at the end through inc_nrnb().
+ *
+ * NOTE(review): the i-water charges and LJ type are taken from nlist->iinr[0]
+ * only, and that element is read before the nri loop; presumably callers never
+ * pass an empty list and all waters in a list are identical - confirm.
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Several of the locals declared below (e.g. ggid, rcutoff_scalar, velecsum, vvdwsum,
+     * one_sixth, one_twelfth, one, two, gbconv, ewconv, the masks) are not referenced
+     * again in this force-only kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters.
+     * These are read once from the first i entry (iinr[0]) and reused for every
+     * i entry in the outer loop: facel-scaled charges for sites 1-3 and the
+     * vdwparam row offset for site 0. Presumably all waters in the list share
+     * the same charges and LJ type - TODO confirm against the list builder.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Pre-initialize the j indices and offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over the nri i entries of the neighbor list */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors:
+         * the j entries of this i entry are jjnr[j_index_start .. j_index_end-1]
+         */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) are processed per
+         * iteration; an odd leftover j particle is handled after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i site minus j particle, per i site) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it:
+             * an inverse square root for sites 1-3 (table lookup needs r) and an
+             * inverse of rsq for site 0 (LJ needs only even powers of 1/r).
+             */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles: charges of both j particles and
+             * their offsets into the vdwparam row selected by vdwioffset0.
+             */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 0 and the j atoms (c6/c12 pair) */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq, assuming
+             * _fjsp_msub_v2r8(a,b,c) evaluates a*b-c - TODO confirm.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: the same d*fscal term is accumulated for the
+             * i site and for the j particle; the j sum is applied to f by the
+             * decrement helper after the last interaction.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i site 1 and the j atoms (charge product) */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps is the remainder after truncation; the indices are scaled by 4
+             * because each table point is read as 4 consecutive reals below.
+             */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Two reals are loaded at offset 0 and two at offset 2 of each lane's
+             * table point, then transposed into Y,F and G,H.
+             * Assuming _fjsp_madd_v2r8(a,b,c) evaluates a*b+c (TODO confirm):
+             *   FF    = F + 2*vfeps*G + 3*vfeps^2*H
+             *   felec = -qq*FF*vftabscale*rinv
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i site 2 and the j atoms (charge product) */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for site 1) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i site 3 and the j atoms (charge product) */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for site 1) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 159 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates
+             * (remainder path: one leftover j particle, only index A is used)
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles (single charge combined with a zero vector) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 0 and the j atom (c6/c12 pair) */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force.
+             * fscal was just combined with a zero vector through _fjsp_unpacklo_v2r8,
+             * presumably so the lane without a j particle contributes no force
+             * - TODO confirm against the intrinsic's definition.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i site 1 and the j atom (charge product) */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Only the table point at vfconv.i[0] is loaded; zero vectors take the
+             * place of the loads that the paired loop does at vfconv.i[1].
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i site 2 and the j atom (charge product) */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for site 1) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i site 3 and the j atom (charge product) */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for site 1) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 159 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i-site forces to the helper,
+         * which receives the i force and shift-force destinations.
+         */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 24 per i entry plus 159 per j particle */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*159);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..4c49563
--- /dev/null
@@ -0,0 +1,2351 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 452 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 452 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*452);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 411 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 411 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*411);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d12f4a4
--- /dev/null
@@ -0,0 +1,564 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA, jnrB) whose data is put in the two positions of the SIMD register.
+     * NOTE(review): several locals below (e.g. tx, rcutoff, c6_00, krf, Heps, one, two, gbconv, ewconv) are never used here. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Pre-initialize the j indices and offsets so compilers do not warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset (in reals) of this list's shift vector; used to index both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this list's j entries are jjnr[j_index_start..j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: i charge pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: processes j particles in pairs (A,B); an unpaired last one is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4; /* NOTE(review): stride of 4 reals per table point, presumably Y,F,G,H - confirm against the table layout */
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: entries for A come from vfconv.i[0], for B from vfconv.i[1] */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force: accumulate fscal*d into the i force sums; the j forces are updated by the helper below */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end) /* one unpaired j particle is left over from the pairwise loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single-pointer variant: only particle A exists here) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles: only charge+jnrA is read, combined with a zeroed register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: only the vfconv.i[0] entry is read; F and H start as zero before the transposes */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom; velec is first combined with a zero register (presumably to clear the unused lane - confirm). */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force, using the fscal that was combined with a zero register above */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum goes to the energygrp_elec entry at index ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counts every j entry, including an unpaired last one) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 8 per outer iteration plus 46 per inner iteration, reported via inc_nrnb */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*46);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 42 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 42 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*42);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..6f93895
--- /dev/null
@@ -0,0 +1,1026 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Tabulated (cubic spline) electrostatics without VdW for 3-site water i entries against single j particles.
+     * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters: charges are read once via iinr[0] and reused for every i water below */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Give the j indices and offsets defined values to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* The loads below address the table at 4*index and 4*index+2, hence the scaling by 4 */
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 141 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Remainder: one j particle is left over when this list has an odd number of j entries; only jnrA is used */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles (a single charge, combined with a zero vector) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom; velec is first combined with a zero vector (presumably to clear the unused lane - confirm) */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 141 flops (the same per-j count that is passed to inc_nrnb below) */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum is passed with the energygrp_elec entry for this list's group index */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counted per j particle, not per unrolled step) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 19 per outer iteration plus 141 per j particle */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*141);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable (table data and scale come from kernel_data->table_elec)
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle (three i atoms per list entry against single j particles)
+ * Calculate force/pot:        Force (no potential energy is accumulated in this variant)
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist, /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx, /* coordinates, accessed through xx[0] */
+                     rvec * gmx_restrict                    ff, /* forces; ff[0] is handed to the force update helpers */
+                     t_forcerec * gmx_restrict              fr, /* supplies shift_vec, fshift and epsfac */
+                     t_mdatoms * gmx_restrict               mdatoms, /* supplies chargeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* supplies table_elec (data and scale) */
+                     t_nrnb * gmx_restrict                  nrnb) /* flop accounting, passed to inc_nrnb() */
+{
+    /* Suffixes 0,1,2 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (here: i atoms 0-2 of the water and the single j particle 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is placed in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; /* ggid is never referenced in this kernel */
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid; /* gid is assigned below but not read afterwards */
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* only fscal is used below */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2; /* only felec and facel are used below */
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv exposes the two table indices as integers; gbconv/ewconv are unused here */
+
+    x                = xx[0]; /* flat real pointer to the coordinates, indexed with DIM*atom below */
+    f                = ff[0]; /* flat real pointer to the forces, indexed the same way */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters: the three i charges are read once, from the first list entry, and multiplied by facel; they are not reloaded in the outer loop */
+    inr              = nlist->iinr[0]; /* NOTE(review): read even when nri is 0 - presumably callers never pass an empty list; confirm */
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings (each of these is assigned again before it is used in the loops) */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists (one i water per iteration) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries jindex[iidx] .. jindex[iidx+1]-1 belong to this i water */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8(); /* i force accumulators are reset for every list entry */
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j particles are taken two at a time; an odd last one is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles (the charges of jnrA and jnrB) */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j force accumulators, reset for every pair of j particles */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *   (i atom 0 with j atom 0)
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp)); /* rt minus the truncated index, i.e. presumably the fractional part */
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4; /* index times 4; the loads below read at offsets +0 and +2 from this position */
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: only the force term FF is built; Y is loaded and transposed but not used in the arithmetic */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F); /* assuming madd(a,b,c) = a*b+c: Fp = F + vfeps*(G + vfeps*H) */
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp); /* same assumption: FF = Fp + vfeps*(G + 2*vfeps*H) */
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            /* Update vectorial force: the same dx*fscal term is accumulated for the i atom and for the j particles */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *   (i atom 1 with j atom 0)
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for the 00 pair, reusing the rt/vfeps/Y/F/G/H temporaries) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force (i atom 1 accumulators, shared j accumulators) */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *   (i atom 2 with j atom 0)
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for the 00 pair, reusing the rt/vfeps/Y/F/G/H temporaries) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force (i atom 2 accumulators, shared j accumulators) */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* summed j forces are applied to f for both j particles (presumably subtracted, going by the helper name) */
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j particles: exactly one is left for this single-particle pass */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j particle, only jnrA) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles (only the charge of jnrA; the other operand is a zero register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *   (i atom 0 with the single j atom)
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4; /* scaled like i[0], but only vfconv.i[0] is used for the loads in this pass */
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: table data is loaded for index i[0] only */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8(); /* no second j particle: zeros stand in for its table data */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* NOTE(review): presumably keeps element 0 and zeroes the other so the unused position adds no force - confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *   (i atom 1 with the single j atom)
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: table data is loaded for index i[0] only */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *   (i atom 2 with the single j atom)
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: table data is loaded for index i[0] only */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* summed j force is applied to f for the single j particle */
+
+            /* Inner loop uses 129 flops */
+        }
+
+        /* End of innermost loop: the accumulated i forces are handed over together with the f and fshift destinations */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 18 per outer and 129 per inner iteration are booked under eNR_NBKERNEL_ELEC_W3_F */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*129);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d5e0083
--- /dev/null
@@ -0,0 +1,2170 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*414);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*378);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..92809fa
--- /dev/null
@@ -0,0 +1,1026 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable (Y,F,G,H values are read from kernel_data->table_elec->data)
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle (i-side sites 1,2,3 of each entry against single j particles)
+ * Calculate force/pot:        PotentialAndForce (velecsum is handed on together with kernel_data->energygrp_elec)
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 1,2,3 on i-side variables name the three water sites handled by this kernel (site 0 is
+     * skipped via the +DIM offsets below); suffix 0 on j-side variables names the single j particle.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is put in the two positions of the SIMD register (v2r8, see vfconv.i[2]).
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar; /* not referenced in this kernel */
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask; /* not referenced in this kernel */
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only vfconv is used below */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters: charges of sites 1-3 are taken once, from the first i entry, pre-scaled by facel */
+    inr              = nlist->iinr[0]; /* NOTE(review): read even when nri is 0 - confirm callers never pass an empty list */
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Give the j indices and offsets defined values up front; this only serves to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (the +DIM offset starts at site 1, skipping site 0) */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j particles are processed two at a time (A and B) */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4; /* stride of 4 reals per table point: two are loaded at +0 and two at +2 below */
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 141 flops */
+        }
+
+        if(jidx<j_index_end) /* remainder: one j entry is left over when the list length is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for the single j particle (presumably low lane = charge, high lane = 0 - confirm intrinsic) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8(); /* no second j particle: zeros replace the vfconv.i[1] table load */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably clears the unused second lane - confirm intrinsic */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 141 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset); /* same +DIM site offset as the coordinate load */
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counted per j entry, not per two-wide SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 19 per outer iteration plus 141 per counted j entry */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*141);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*129);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c865db2
--- /dev/null
@@ -0,0 +1,2170 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*414);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*378);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..6acc0a7
--- /dev/null
@@ -0,0 +1,679 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb (velec = qq*rinv; no cutoff test appears in this kernel)
+ * VdW interaction:            CubicSplineTable (dispersion and repulsion splines read from kernel_data->table_vdw)
+ * Geometry:                   Particle-Particle (one i particle per list entry, j particles taken two at a time)
+ * Calculate force/pot:        PotentialAndForce (energy sums are handed over per energy group, see gid/ggid below)
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list: nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates, addressed below as a flat real array from xx[0] */
+                     rvec * gmx_restrict                    ff,           /* forces, passed as a flat real array (from ff[0]) to the force helpers */
+                     t_forcerec * gmx_restrict              fr,           /* shift_vec, fshift, epsfac, ntype and nbfp are used */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* chargeA and typeA are read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* table_vdw (data, scale), energygrp_elec and energygrp_vdw are used */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, updated once through inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of a _fjsp_v2r8 SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar; /* not referenced in this kernel */
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* only fscal is referenced in this kernel */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; /* isai0 is not referenced in this kernel */
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; /* only jx0,jy0,jz0 and jq0 are referenced in this kernel */
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2; /* crf,krf,krf2 are not referenced in this kernel */
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; /* rinvsix,rvdw,sh_vdw_invrcut6 are not referenced */
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);  /* initialized but not referenced in this kernel */
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0); /* initialized but not referenced in this kernel */
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps; /* Heps is not referenced in this kernel */
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask; /* not referenced in this kernel */
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* initialized but not referenced in this kernel */
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* initialized but not referenced in this kernel */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only vfconv is used: reads the stored table index back as two integers */
+
+    x                = xx[0]; /* flat pointer to the first coordinate component */
+    f                = ff[0]; /* flat pointer to the first force component */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac); /* multiplied into the i charge below */
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp; /* read below as (c6,c12) pairs at offset 2*(nvdwtype*type_i + type_j) */
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings (each of these is assigned inside the loops before it is used) */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge pre-scaled by facel, and the i-type offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * (1/r) */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp)); /* remainder of rt after removing the integer table index */
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* 8 reals per table point: offsets 0..3 are read for dispersion, 4..7 for repulsion */
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION (table entries of particles A and B are loaded pairwise, then transposed) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F); /* Fp = F + eps*(G + eps*H), assuming madd(a,b,c) = a*b+c */
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y); /* VV = Y + eps*Fp */
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp); /* FF = Fp + eps*(G + 2*eps*H) */
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (same scheme, table offsets 4..7) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00))); /* -(fvdw6+fvdw12)*tabscale*rinv */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 66 flops (the per-j-particle figure charged in inc_nrnb below) */
+        }
+
+        if(jidx<j_index_end) /* odd number of j particles: one is left over, handled as particle A only */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably charge in the lower element, zero in the upper */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8; /* scaled for symmetry with the main loop; only vfconv.i[0] is read in this branch */
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION (single table point; zero vectors stand in for the missing particle B) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (single table point, table offsets 4..7) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably keeps the lower element and zeroes the unused upper one */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same masking for the force scalar */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 66 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies (entry ggid of the electrostatic and VdW energy group arrays) */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD iteration) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*66);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb (felec = qq*rinv*rinvsq; no cutoff test appears in this kernel)
+ * VdW interaction:            CubicSplineTable (dispersion and repulsion splines read from kernel_data->table_vdw)
+ * Geometry:                   Particle-Particle (one i particle per list entry, j particles taken two at a time)
+ * Calculate force/pot:        Force (no potential sums are accumulated or written by this variant)
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list: nri, iinr, jindex, jjnr and shift are used; gid is fetched but unused */
+                     rvec * gmx_restrict                    xx,           /* coordinates, addressed below as a flat real array from xx[0] */
+                     rvec * gmx_restrict                    ff,           /* forces, passed as a flat real array (from ff[0]) to the force helpers */
+                     t_forcerec * gmx_restrict              fr,           /* shift_vec, fshift, epsfac, ntype and nbfp are used */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* chargeA and typeA are read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* only table_vdw (data, scale) is used here */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, updated once through inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of a _fjsp_v2r8 SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; /* ggid is not referenced in this kernel */
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar; /* not referenced in this kernel */
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* only fscal is referenced in this kernel */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; /* isai0 is not referenced in this kernel */
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; /* only jx0,jy0,jz0 and jq0 are referenced in this kernel */
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2; /* velecsum,crf,krf,krf2 are not referenced in this kernel */
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; /* only fvdw,fvdw6,fvdw12 are referenced */
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);  /* initialized but not referenced in this kernel */
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0); /* initialized but not referenced in this kernel */
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps; /* Heps and VV are not referenced in this kernel */
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask; /* not referenced in this kernel */
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* initialized but not referenced in this kernel */
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* initialized but not referenced in this kernel */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only vfconv is used: reads the stored table index back as two integers */
+
+    x                = xx[0]; /* flat pointer to the first coordinate component */
+    f                = ff[0]; /* flat pointer to the first force component */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* not used further in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac); /* multiplied into the i charge below */
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp; /* read below as (c6,c12) pairs at offset 2*(nvdwtype*type_i + type_j) */
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings (each of these is assigned inside the loops before it is used) */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge pre-scaled by facel, and the i-type offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * (1/r) */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp)); /* remainder of rt after removing the integer table index */
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* 8 reals per table point: offsets 0..3 are read for dispersion, 4..7 for repulsion */
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS (velec is only an intermediate for felec here) */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION (Y is not used after the transpose; presumably loaded only to obtain F) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F); /* Fp = F + eps*(G + eps*H), assuming madd(a,b,c) = a*b+c */
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp); /* FF = Fp + eps*(G + 2*eps*H) */
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (same scheme, table offsets 4..7) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00))); /* -(fvdw6+fvdw12)*tabscale*rinv */
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 57 flops (the per-j-particle figure charged in inc_nrnb below) */
+        }
+
+        if(jidx<j_index_end) /* odd number of j particles: one is left over, handled as particle A only */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably charge in the lower element, zero in the upper */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8; /* scaled for symmetry with the main loop; only vfconv.i[0] is read in this branch */
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION (single table point; zero vectors stand in for the missing particle B) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (single table point, table offsets 4..7) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the lower element and zeroes the unused upper one */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 57 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD iteration) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*57);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c48a755
--- /dev/null
@@ -0,0 +1,989 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb (velec = qq*rinv, evaluated for all three i atoms)
+ * VdW interaction:            CubicSplineTable (dispersion + repulsion read from kernel_data->table_vdw, i atom 0 only)
+ * Geometry:                   Water3-Particle (three i atoms against single j particles)
+ * Calculate force/pot:        PotentialAndForce (energy sums are passed on via kernel_data->energygrp_elec / energygrp_vdw)
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2 index the three atoms of the i water; the lone j particle is always 0,
+     * so dx10 is the x displacement from j atom 0 to i atom 1 (ix1 - jx0).
+     * Suffixes A,B name the two j neighbors handled per unrolled inner-loop iteration; their data
+     * share one two-element _fjsp_v2r8 register. Several locals below (e.g. tx, rcutoff, krf) are never used here.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* lets the converted table indices be read back as scalars; only vfconv is used */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters once, from the first i entry (iinr[0]); they are reused unchanged for all nri entries */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Pre-set the j indices and offsets purely to silence compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j neighbors (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* table stride is 8 reals per point: dispersion entries first, repulsion entries from +4 */
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION: Fp = F + eps*(G + eps*H), VV = Y + eps*Fp, FF = Fp + eps*(G + 2*eps*H), assuming madd(a,b,c) = a*b + c */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: same spline evaluation on the table entries at offsets +4 and +6 */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force: accumulate on i atom 0 and on the shared j-force accumulator */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        if(jidx<j_index_end) /* epilogue: one j neighbor is left over when the neighbor count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* same 8-reals-per-point table stride as in the unrolled loop */
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION: only the table row for neighbor A is loaded; the B row is replaced by zeros */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: as above, using the table entries at offsets +4 and +6 */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second element before summing - confirm against the intrinsic docs */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies in the energy-group slot selected by gid[iidx] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j neighbor, not per unrolled pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 20 per outer iteration plus 131 per j neighbor */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*131);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*120);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..3786c76
--- /dev/null
@@ -0,0 +1,1671 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 314 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 314 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*314);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d299daf
--- /dev/null
@@ -0,0 +1,1097 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb (velec = qq*rinv for i sites 1-3; no cutoff is applied in this kernel)
+ * VdW interaction:            CubicSplineTable (i site 0 only; read from kernel_data->table_vdw)
+ * Geometry:                   Water4-Particle (four i sites against one j site, two j entries per SIMD iteration)
+ * Calculate force/pot:        PotentialAndForce (forces into ff and fr->fshift, energies into kernel_data->energygrp_elec/energygrp_vdw)
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist, /* neighbor list: nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx, /* coordinates, read only */
+                     rvec * gmx_restrict                    ff, /* forces, updated in place for both i and j atoms */
+                     t_forcerec * gmx_restrict              fr, /* shift_vec, epsfac, ntype and nbfp are read; fshift is updated */
+                     t_mdatoms * gmx_restrict               mdatoms, /* chargeA and typeA are read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* table_vdw is read; energygrp_elec and energygrp_vdw are updated */
+                     t_nrnb * gmx_restrict                  nrnb) /* flop accounting, handed to inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Note: several locals declared below (e.g. rcutoff, krf, one_sixth, dummy_mask) are never referenced in this function. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only vfconv is used in this function; it exposes the two table indices as integers */
+
+    x                = xx[0]; /* flat real pointer to the first coordinate component */
+    f                = ff[0]; /* flat real pointer to the first force component */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac); /* multiplied into the i charges below */
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters: read once from the first i entry (iinr[0]) and reused for every outer iteration */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1])); /* only sites 1-3 get a charge; iq0 is never set */
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset into vdwparam for i site 0; 2*type(j) is added per j below */
+
+    /* Give the j indices defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists (one i entry per iidx) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index (DIM reals per atom in x and f) */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8(); /* i force accumulators, written back once after both j loops */
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums (accumulated per i entry, flushed to the energy group arrays after the j loops) */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A,B) per iteration; an odd leftover entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the four i sites */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10); /* no rinvsq00: the 0-0 pair below only uses r00 and rinv00 */
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j force accumulators, reset for every j pair */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* rsq*rinv, i.e. r if rinv00 is 1/sqrt(rsq00) - TODO confirm helper */
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp)); /* rt minus its converted-back integer part */
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* stride of 8 reals per table point: +0/+2 are read for dispersion, +4/+6 for repulsion */
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F); /* presumably regroups the A and B rows so Y and F each hold one value per j - confirm in the kernelutil header */
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F); /* F + eps*(G + eps*H), taking madd(a,b,c) as a*b+c - TODO confirm */
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y); /* Y + eps*Fp under the same assumption */
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp); /* Fp + eps*(G + 2*eps*H) under the same assumption */
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00))); /* -(fvdw6+fvdw12)*vftabscale*rinv00 */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: the same d*fscal term is accumulated for i and for j */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* applies the summed j force from all four i sites to f for jnrA and jnrB */
+
+            /* Inner loop uses 155 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j entries: handle the last one alone; only jnrA is loaded in this path */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the four i sites */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably the single j charge in the low lane over a zero register - TODO confirm */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8; /* scaled but not used for any table load in this single-j path */
+
+            /* CUBIC SPLINE TABLE DISPERSION (second table row replaced by zeros) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (second table row replaced by zeros) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8()); /* presumably keeps the low lane and zeroes the other, so the missing second j adds nothing - TODO confirm */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same lane masking applied to the force scalar */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* single-pointer variant: only f for jnrA is touched */
+
+            /* Inner loop uses 155 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies for energy group index ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counts j entries, not SIMD iterations) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*155);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 144 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 144 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*144);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..54f9d59
--- /dev/null
@@ -0,0 +1,1791 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 341 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 341 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*341);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*324);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c5950bc
--- /dev/null
@@ -0,0 +1,545 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle (one i particle against its list of j neighbors)
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Initialize the j indices and offsets so compilers do not warn about possibly-uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i particle (iinr[iidx]) per list entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: entries jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge pre-multiplied by facel, and the i offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j neighbors (A,B) per iteration; an odd leftover is handled after it */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: charges, and j offsets of the (c6,c12) pairs in vdwparam */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 43 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Odd number of neighbors: load coordinates of the single remaining j atom (A only) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for the single j particle; its charge is loaded on top of a zeroed register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sums; unpacklo with a zero register presumably clears the unused element - TODO confirm */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 43 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies at index ggid of the energygrp_elec and energygrp_vdw arrays */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 9 per outer iteration and 43 per inner iteration */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*43);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle (one i particle against its list of j neighbors)
+ * Calculate force/pot:        Force (no potential energy sums are accumulated in this variant)
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Initialize the j indices and offsets so compilers do not warn about possibly-uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i particle (iinr[iidx]) per list entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: entries jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge pre-multiplied by facel, and the i offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j neighbors (A,B) per iteration; an odd leftover is handled after it */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: charges, and j offsets of the (c6,c12) pairs in vdwparam */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (force only; the vvdw6/vvdw12 potentials are not formed) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 37 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Odd number of neighbors: load coordinates of the single remaining j atom (A only) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for the single j particle; its charge is loaded on top of a zeroed register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (force only; the vvdw6/vvdw12 potentials are not formed) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force (fscal was combined with a zero register just above) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 37 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 7 per outer iteration and 37 per inner iteration */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*37);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..83f4c9b
--- /dev/null
@@ -0,0 +1,855 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i entry of nlist (three i atoms; their charges are taken from
+ * charge[inr+0..2]) this loops over the j atoms listed in
+ * jjnr[jindex[iidx] .. jindex[iidx+1]-1] and evaluates a plain qq/r Coulomb
+ * term for all three i atoms plus a 12-6 Lennard-Jones term for i atom 0 only.
+ *
+ * nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+ * xx          - coordinates, accessed as a flat real array starting at xx[0].
+ * ff          - forces, updated through a flat real pointer to ff[0].
+ * fr          - supplies shift_vec, fshift, epsfac, ntype and nbfp.
+ * mdatoms     - supplies chargeA and typeA.
+ * kernel_data - energygrp_elec[] and energygrp_vdw[] receive the summed potentials.
+ * nrnb        - handed to inc_nrnb() together with the flop estimate.
+ *
+ * NOTE(review): nlist->iinr[0] is read before the outer loop, so callers
+ * presumably never pass a list with nri == 0 - confirm.
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to the two j atoms (jnrA, jnrB) that are processed together in one
+     * iteration of the unrolled inner loop, i.e. the two positions of a double precision
+     * SIMD register.
+     * Note: many of the locals declared below are never used in this kernel; the file is
+     * generated from a template shared with other kernel flavours.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters
+     * The three i charges (pre-scaled by facel) and the LJ parameter row offset of
+     * i atom 0 are taken once from the first i entry and reused unchanged for every
+     * outer iteration; presumably all i molecules in the list are identical waters -
+     * verify against the list builder.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Give the j indices and offsets defined values up front so the compiler does
+     * not warn about possibly uninitialised use.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop
+         * Handles j atoms in pairs (A and B). The loop condition stops one short of
+         * j_index_end, so when the j count is odd the last atom is left for the
+         * single-atom block that follows the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms
+             * (i atom 0: charge product plus the c6/c12 pair for each of the two j atoms)
+             */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * The potential is vvdw12/12 - vvdw6/6 while the force uses
+             * (vvdw12 - vvdw6)*rinvsq, which presumably means the c6/c12 values in
+             * vdwparam are stored premultiplied by 6 and 12 - TODO confirm.
+             * (Assumes _fjsp_msub_v2r8(a,b,c) evaluates a*b-c.)
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force
+             * The same d*fscal term is accumulated once for the i atom (fix0..) and once
+             * for the j atoms (fjx0..); the j sum is handed to the decrement helper at
+             * the end of the iteration.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms
+             * (i atom 1: Coulomb only, no LJ parameters are loaded)
+             */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms
+             * (i atom 2: Coulomb only, no LJ parameters are loaded)
+             */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates
+             * Remainder step: only one j atom (A) is left, so the single-pointer
+             * variants of the load/store helpers are used and the B index is untouched.
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms
+             * (i atom 0: charge product plus the c6/c12 pair of the single j atom)
+             */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * Same expressions as in the two-atom loop above.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * The _fjsp_unpacklo_v2r8(value,zero) calls in this block presumably keep only
+             * the first SIMD element and zero the second, so the unused B position adds
+             * nothing to the sums or forces - confirm against the intrinsic's definition.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms
+             * (i atom 1: Coulomb only)
+             */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms
+             * (i atom 2: Coulomb only)
+             */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop
+         * The nine accumulated i force components are passed to the helper together
+         * with the i atom's slot in f and the shift-force slot selected by shiftidx[iidx].
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies
+         * The sums for this i entry go to the energy-group slots selected by gid[iidx].
+         */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations
+         * Counted per j atom, not per two-atom SIMD iteration.
+         */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops
+     * Uses the per-iteration estimates quoted in the loop comments above
+     * (20 per outer iteration, 108 per j atom).
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 100 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 100 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*100);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..56f53a1
--- /dev/null
@@ -0,0 +1,1537 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 291 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 291 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*291);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 277 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 277 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..a52b05d
--- /dev/null
@@ -0,0 +1,963 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*131);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every i entry of the neighbor list, four i sites
+ * (suffixes 0..3) interact with each single-site j particle. Site 0 gets only
+ * the Lennard-Jones term; sites 1..3 get only the plain Coulomb term. No
+ * potential energies are accumulated in this variant.
+ *
+ * nlist        neighbor list; nri, iinr, jindex, jjnr and shift are used
+ *              (gid is fetched but not referenced afterwards)
+ * xx           coordinates, read through xx[0] as a flat array with DIM reals per atom
+ * ff           forces, accessed through ff[0] and handed to the i/j force helpers
+ * fr           supplies shift_vec, fshift, epsfac, ntype and nbfp
+ * mdatoms      supplies chargeA and typeA
+ * kernel_data  not referenced by this variant
+ * nrnb         flop accounting, incremented once at the end via inc_nrnb()
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Note: many of the locals declared below (e.g. rcutoff, krf, vvdwsum, one_sixth, vfconv)
+     * are never referenced in this kernel variant; presumably they are shared boilerplate
+     * emitted by the kernel generator.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters.
+     * The charges of i sites 1..3 and the LJ type of i site 0 are read once, from the
+     * first i entry of the list (iinr[0]), and reused for every i entry below
+     * (presumably all i molecules in the list share one water model - confirm with
+     * the neighbor-list builder). The charges are pre-multiplied by facel (fr->epsfac).
+     * No charge is loaded for site 0; only its offset into vdwparam is computed.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) are processed per
+         * iteration; an odd leftover entry is handled after the loop. */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it:
+             * 1/r is computed for the charged sites 1..3, while the LJ-only site 0
+             * needs just 1/r^2. */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles: charges of A and B, and each
+             * particle's offset (2*type) within the i site's row of vdwparam */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * fvdw = msub(c12,rinvsix,c6) * rinvsix * rinvsq00, i.e.
+             * (c12/r^12 - c6/r^6)/r^2 assuming _fjsp_msub_v2r8(a,b,c) = a*b-c
+             * (confirm against the intrinsic's definition).
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: the same d*fscal product is accumulated for the
+             * i site and for the j particle; the j sum is applied once at the end of
+             * the iteration. */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq/r, felec = velec/r^2 */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 123 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j entries: one leftover j particle remains */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (only the A particle exists in this remainder step) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles: a single charge is loaded on top of a
+             * zero vector, and only the A type offset is needed */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expression as in the paired loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused B lane - confirm against the intrinsic */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq/r, felec = velec/r^2 */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 123 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated forces of the four i sites,
+         * together with this entry's positions in f and fshift, to the i-force
+         * update helper */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counts j entries, not loop trips,
+         * so a paired iteration contributes two) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration costs quoted in the loop comments */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*123);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..44e3580
--- /dev/null
@@ -0,0 +1,1657 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 317 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 317 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*317);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 303 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 303 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d3b90c8
--- /dev/null
@@ -0,0 +1,482 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For each i entry of the neighbor list the kernel walks the j entries two
+ * at a time, computes velec = qq*rinv and felec = velec*rinvsq for every
+ * pair, accumulates felec*d into the i force registers and hands the same
+ * scalar force and displacement to the j force helper. After the j loop the
+ * i force, the shift force and the summed potential are written back through
+ * the gmx_fjsp_update_* helpers.
+ *
+ * nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+ * xx           coordinates; addressed as a flat real array via xx[0]
+ * ff           forces; addressed as a flat real array via ff[0] and updated
+ * fr           supplies shift_vec, fshift (updated) and epsfac
+ * mdatoms      supplies chargeA
+ * kernel_data  energygrp_elec + gid[iidx] is passed to the potential update
+ * nrnb         flop counters, updated through inc_nrnb()
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Note: many of the locals declared below (rcutoff_scalar, tx/ty/tz, rcutoff, rcutoff2,
+     * jidxall, vdwioffset0, isai0, vdwjidx0A/B, fjx0/fjy0/fjz0, isaj0, r00, c6_00, c12_00,
+     * crf, krf, krf2, itab_tmp, dummy_mask, cutoff_mask, one, two and the *conv unions) are
+     * never referenced in this kernel; presumably the generator emits them for every flavour.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+    /* x and f are used below as flat real arrays, indexed with DIM*atom offsets */
+    x                = xx[0];
+    f                = ff[0];
+    /* Cache the neighbor list arrays and the force record / atom data used by the loops */
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Give the j indices and offsets a defined value up front so that
+     * compilers do not warn about possibly uninitialized use.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries jindex[iidx] .. jindex[iidx+1]-1 */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: processes j entries in pairs (A,B); an odd
+         * leftover entry is handled by the separate branch after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force: fi += d*fscal; the j forces are handled by
+             * the helper call below (by its name it decrements f at the two j offsets).
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 31 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Remainder: one j entry is left when the number of j entries is odd.
+             * Only jnrA is used. jq0 is built with _fjsp_loadl_v2r8 from a zero
+             * vector, and velec/fscal are passed through _fjsp_unpacklo_v2r8 with a
+             * zero vector before being accumulated - presumably so that the second
+             * SIMD lane contributes nothing (confirm against the intrinsic definitions).
+             */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 31 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper
+         * together with the i force and shift force destinations.
+         */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum goes to the energygrp_elec slot selected by gid[iidx] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 8 per i entry plus 31 per j entry, matching
+     * the per-loop flop comments above.
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*31);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for each i entry of the neighbor list the j entries are
+ * walked two at a time, felec = qq*rinv*rinvsq is computed for every pair,
+ * felec*d is accumulated into the i force registers and the same scalar
+ * force and displacement are handed to the j force helper. No potential is
+ * accumulated or written.
+ *
+ * nlist        neighbor list; nri, iinr, jindex, jjnr and shift are read
+ *              (gid is fetched but not used here)
+ * xx           coordinates; addressed as a flat real array via xx[0]
+ * ff           forces; addressed as a flat real array via ff[0] and updated
+ * fr           supplies shift_vec, fshift (updated) and epsfac
+ * mdatoms      supplies chargeA
+ * kernel_data  not referenced in this kernel
+ * nrnb         flop counters, updated through inc_nrnb()
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Note: many of the locals declared below (ggid, velecsum, rcutoff_scalar, tx/ty/tz, rcutoff,
+     * rcutoff2, jidxall, vdwioffset0, isai0, vdwjidx0A/B, fjx0/fjy0/fjz0, isaj0, r00, c6_00,
+     * c12_00, crf, krf, krf2, itab_tmp, dummy_mask, cutoff_mask, one, two and the *conv unions)
+     * are never referenced in this kernel; presumably the generator emits them for every flavour.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+    /* x and f are used below as flat real arrays, indexed with DIM*atom offsets */
+    x                = xx[0];
+    f                = ff[0];
+    /* Cache the neighbor list arrays and the force record / atom data used by the loops */
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Give the j indices and offsets a defined value up front so that
+     * compilers do not warn about possibly uninitialized use.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries jindex[iidx] .. jindex[iidx+1]-1 */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop: processes j entries in pairs (A,B); an odd
+         * leftover entry is handled by the separate branch after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv is only an intermediate for felec here */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            fscal            = felec;
+
+            /* Update vectorial force: fi += d*fscal; the j forces are handled by
+             * the helper call below (by its name it decrements f at the two j offsets).
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Remainder: one j entry is left when the number of j entries is odd.
+             * Only jnrA is used. jq0 is built with _fjsp_loadl_v2r8 from a zero
+             * vector, and fscal is passed through _fjsp_unpacklo_v2r8 with a zero
+             * vector before being accumulated - presumably so that the second
+             * SIMD lane contributes nothing (confirm against the intrinsic definitions).
+             */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv is only an intermediate for felec here */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper
+         * together with the i force and shift force destinations.
+         */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 7 per i entry plus 30 per j entry, matching
+     * the per-loop flop comments above.
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*30);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..dbcbeab
--- /dev/null
@@ -0,0 +1,792 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*96);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * For each of the nlist->nri outer entries (three i sites each) the kernel
+ * evaluates plain Coulomb interactions qq*rinv*rinvsq against every listed
+ * j particle, two j particles per SIMD iteration plus an optional single
+ * remainder, and hands the accumulated values to the helper routines that
+ * receive f and fshift. No potential energy is accumulated or stored.
+ *
+ * Inputs read here: nlist (nri, iinr, jindex, jjnr, shift), xx, fr->shift_vec,
+ * fr->fshift, fr->epsfac and mdatoms->chargeA. kernel_data is not referenced
+ * in this force-only variant. The flop estimate is reported through inc_nrnb().
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    /* NOTE(review): several locals declared below (e.g. rcutoff_scalar, tx/ty/tz,
+     * the vdw* offsets, isai*/isaj0, r00.., c6_*/c12_*, velecsum, crf/krf/krf2,
+     * itab_tmp, the masks, one/two, the *conv unions, ggid) are never referenced
+     * in this body; presumably they come from the shared generator template.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    /* Flat real* views of the coordinate and force rvec arrays */
+    x                = xx[0];
+    f                = ff[0];
+
+    /* Cache neighborlist arrays and force-record data in locals */
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    /* The three i-site charges (pre-multiplied by facel) are taken from the
+     * first list entry only and reused for every outer iteration, i.e. all
+     * i entries are assumed to carry identical charges.
+     * NOTE(review): iinr[0] is read before nri is checked - confirm callers
+     * never pass an empty list.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Initialize to silence uninitialized-variable compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        /* Clear the per-i-site force accumulators for this list entry */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        /* Handles j particles in pairs (A,B); an odd leftover is processed
+         * by the if-block that follows this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /* j-particle accumulators collect the contributions of all three i sites */
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            /* velec is only an intermediate here; it is not summed in this force-only kernel */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /* Apply the accumulated j contributions to the force entries of both j
+             * particles (presumably subtracted, per the helper's name - confirm).
+             */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        /* Remainder: one unpaired j particle is left when the j count is odd */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            /* Only one charge is loaded; presumably the other SIMD element stays zero - confirm */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            fscal            = felec;
+
+            /* Presumably keeps only the element of the real j particle and zeroes
+             * the unused one, so it adds nothing to the forces - confirm.
+             */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /* Single-pointer variant: only the one remaining j particle is updated */
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        /* End of innermost loop */
+
+        /* Hand the nine i-site accumulators to the helper together with the
+         * i force entry and the shift-force entry for this list.
+         */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        /* Counts j particles, not SIMD iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    /* 18 and 93 are the per-iteration flop estimates quoted in the loop comments above */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*93);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..55fe116
--- /dev/null
@@ -0,0 +1,1480 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 279 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 279 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*279);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb (plain qq/r; no cutoff or reaction-field term is applied in this kernel)
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3 (all 3x3 site pairs between each i water and each j water)
+ * Calculate force/pot:        Force (force-only variant: no potential energy is accumulated or stored)
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this three-site kernel only uses 0,1,2).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Note: several locals declared below (e.g. rcutoff, c6_NN/c12_NN, krf, itab_tmp) are never referenced here. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters: charges are read once from the first i water (iinr[0]) and reused for every i and j water; iq* include the facel prefactor */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Pre-initialize to avoid compiler warnings about possibly uninitialized variables */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j waters (A and B) per iteration; an odd leftover entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+            /* fj* were accumulated with the same sign as fi*; the helper below presumably subtracts them from f for both j waters (per its name) */
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Leftover single j water (odd list length): same math, but each fscal is unpacked with zero, presumably to clear the unused upper SIMD element - TODO confirm */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (one per j list entry, including an odd leftover entry) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*270);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..faa314c
--- /dev/null
@@ -0,0 +1,792 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb (velec = qq*rinv, felec = velec*rinvsq; no cutoff is applied in this kernel)
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle (i sites 1-3 of each water against single j particles)
+ * Calculate force/pot:        PotentialAndForce (potential sum is passed on for kernel_data->energygrp_elec)
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * NOTE: many locals declared below (e.g. rcutoff, krf, c6_10, vfconv) are never used in this kernel. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters: charges of sites 1-3 of the first i water times facel, set once for all i entries */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Initialize the j indices/offsets to avoid compiler warnings about possibly uninitialized variables */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (the +DIM offset skips site 0, so sites 1-3 are loaded) */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters (charges) for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        if(jidx<j_index_end) /* one j particle left over (odd count): only slot A is used */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load the charge of the single j particle; _fjsp_loadl_v2r8 is given a zero vector for the other element */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum; velec is first merged with a zero vector, presumably to clear the unused element (TODO confirm unpacklo semantics) */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies for the energy group index ggid of this i entry */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 19 per i entry and 96 per j particle */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*96);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb (felec = qq*rinv*rinvsq; no cutoff is applied in this kernel)
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle (i sites 1-3 of each water against single j particles)
+ * Calculate force/pot:        Force (no potential sum is accumulated; kernel_data is not used)
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * NOTE: many locals declared below (e.g. ggid, velecsum, rcutoff, krf, vfconv) are never used in this kernel. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters: charges of sites 1-3 of the first i water times facel, set once for all i entries */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Initialize the j indices/offsets to avoid compiler warnings about possibly uninitialized variables */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (the +DIM offset skips site 0, so sites 1-3 are loaded) */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters (charges) for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS (velec is only an intermediate for felec in this force-only kernel) */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        if(jidx<j_index_end) /* one j particle left over (odd count): only slot A is used */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load the charge of the single j particle; _fjsp_loadl_v2r8 is given a zero vector for the other element */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS; fscal is merged with a zero vector below, presumably to clear the unused element (TODO confirm unpacklo semantics) */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 18 per i entry and 93 per j particle */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*93);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..51d8be8
--- /dev/null
@@ -0,0 +1,1480 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 279 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 279 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*279);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ *
+ * For each of the nlist->nri outer entries, evaluates the nine Coulomb pair
+ * terms between sites 1-3 of the i molecule and sites 1-3 of every j entry
+ * in jjnr[jindex[i]..jindex[i+1]), two j entries per SIMD iteration plus an
+ * optional single-entry remainder. All coordinate/force pointers are offset
+ * by DIM, so site 0 of each molecule takes no part in this kernel.
+ * Only force accumulators are maintained; no energy is written here.
+ * mdatoms supplies chargeA, fr supplies epsfac, shift_vec and fshift, and
+ * nrnb receives the flop count. kernel_data is not referenced.
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Note: a number of the locals declared below (e.g. tx/ty/tz, jidxall, vdwioffset1..3, vdwjidx*,
+     * isai1..3, isaj1..3, r11..r33, c6_*, c12_*, rcutoff*, crf, krf, krf2, velecsum, ggid, itab_tmp,
+     * the masks, one/two and the conv unions) are never read in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters: the charges of sites 1-3 are read once from the first
+     * i entry (iinr[0]) and reused for every i and j molecule; the i-side charges carry the
+     * epsfac factor, so each qqXY below is epsfac*q_X*q_Y and is loop-invariant.
+     * NOTE(review): iinr[0] is read even when nri is 0 - presumably callers never pass an
+     * empty list; confirm against the caller. */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (the +DIM offset starts at site 1, skipping site 0) */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: consumes two j entries (A and B) per iteration and stops
+         * while a full pair is still available; a leftover entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates (sites 1-3 of both j entries; +DIM skips site 0) */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector (dXY = i site X minus j site Y) */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq (same form for all nine pairs) */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force: the same d*fscal term is accumulated into both the i and the j accumulators */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3); /* presumably subtracts the accumulated j terms from f for entries A and B - confirm in kernelutil header */
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end) /* one j entry is left over when this list holds an odd number of entries */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j entry, sites 1-3) */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps one lane of fscal and zeroes the other, since only one j entry is valid here - confirm against the intrinsic docs; repeated for every pair below */
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        /* End of innermost loop; the i accumulators for sites 1-3 are handed to the helper below together
+         * with the f and fshift destinations (presumably it reduces the SIMD lanes and adds them in) */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD iteration) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c6d3ca7
--- /dev/null
@@ -0,0 +1,672 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (table lookup; the potential is offset by fr->ic->sh_ewald and masked at the cutoff)
+ * VdW interaction:            LennardJones (the potential is offset using fr->ic->sh_invrc6 and masked at the cutoff)
+ * Geometry:                   Particle-Particle (one i particle per list entry, j particles taken two at a time)
+ * Calculate force/pot:        PotentialAndForce (energy sums are handed to kernel_data->energygrp_elec/energygrp_vdw)
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist, /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx, /* coordinates, accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff, /* forces, accessed as a flat real array through ff[0] */
+                     t_forcerec * gmx_restrict              fr, /* supplies shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rvdw and ic */
+                     t_mdatoms * gmx_restrict               mdatoms, /* supplies chargeA and typeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* supplies energygrp_elec and energygrp_vdw */
+                     t_nrnb * gmx_restrict                  nrnb) /* flop accounting, passed on to inc_nrnb() */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* NOTE(review): tx,ty,tz,jidxall are not referenced in this function */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; /* NOTE(review): isai0 is not referenced in this function */
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; /* NOTE(review): fjx0,fjy0,fjz0,isaj0 are not referenced in this function */
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2; /* NOTE(review): crf,krf,krf2 are not referenced in this function */
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; /* NOTE(review): fvdw6,fvdw12 are not referenced; rvdw is only assigned */
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask; /* NOTE(review): dummy_mask is not referenced in this function */
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* NOTE(review): not referenced after initialization */
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* NOTE(review): not referenced after initialization */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv exposes the two integer table indices; vfconv,gbconv are not referenced */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0; /* indexed below with a stride of 4 reals per table point */
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Give the j indices and offsets defined values up front so compilers do not warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles; the i charge is pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j entries are consumed in pairs; a leftover odd entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one lane is inside the cutoff -- confirm in the kernelutil header */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS: table values are looked up per lane and combined with the plain Coulomb terms below */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is ewrt minus the converted index */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec)); /* qq*((1/r - sh_ewald) - table potential) */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: vvdw carries the sh_vdw_invrcut6 offsets, fvdw is computed from the unshifted terms */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom; lanes outside the cutoff are masked off first */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate on i here, the helper below applies the j side */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 67 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j entries: one entry is left over after the paired loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS: same formulas as the paired loop, but only table index ewconv.i[0] is read */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom; unpacklo with zero presumably clears the unused lane -- confirm */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 67 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies for the energy group index taken from gid[iidx] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counts every j entry, including the odd leftover) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts noted in the loops above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*67);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (force table fr->ic->tabq_coul_F; the force is masked at the cutoff)
+ * VdW interaction:            LennardJones (force only; masked at the cutoff)
+ * Geometry:                   Particle-Particle (one i particle per list entry, j particles taken two at a time)
+ * Calculate force/pot:        Force (no energies are accumulated; kernel_data is not referenced)
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist, /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx, /* coordinates, accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff, /* forces, accessed as a flat real array through ff[0] */
+                     t_forcerec * gmx_restrict              fr, /* supplies shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rvdw and ic */
+                     t_mdatoms * gmx_restrict               mdatoms, /* supplies chargeA and typeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* not referenced in this force-only kernel */
+                     t_nrnb * gmx_restrict                  nrnb) /* flop accounting, passed on to inc_nrnb() */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; /* NOTE(review): ggid is not referenced in this function */
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid; /* NOTE(review): gid is only assigned in this function */
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* NOTE(review): tx,ty,tz,jidxall are not referenced in this function */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; /* NOTE(review): isai0 is not referenced in this function */
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; /* NOTE(review): fjx0,fjy0,fjz0,isaj0 are not referenced in this function */
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2; /* NOTE(review): only felec and facel are referenced in this function */
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; /* NOTE(review): only rinvsix and fvdw are used; rvdw and sh_vdw_invrcut6 are only assigned */
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0); /* NOTE(review): not referenced after initialization */
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0); /* NOTE(review): not referenced after initialization */
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; /* NOTE(review): ewtabD,ewtabV are not referenced; sh_ewald,ewtabhalfspace are only assigned */
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask; /* NOTE(review): dummy_mask is not referenced in this function */
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* NOTE(review): not referenced after initialization */
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* NOTE(review): not referenced after initialization */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv exposes the two integer table indices; vfconv,gbconv are not referenced */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F; /* indexed below directly by the table index (no stride multiplier) */
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Give the j indices and offsets defined values up front so compilers do not warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles; the i charge is pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: j entries are consumed in pairs; a leftover odd entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one lane is inside the cutoff -- confirm in the kernelutil header */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS: the force table is read at the index and (presumably) its successor, then blended with eweps */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is ewrt minus the converted index */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* (1-eweps)*F + eweps*Fn, assuming nmsub(a,b,c) yields c - a*b */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: force only, no potential terms are formed here */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate on i here, the helper below applies the j side */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j entries: one entry is left over after the paired loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS: same formulas as the paired loop, but only table index ewconv.i[0] is read */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* unpacklo with zero presumably clears the unused lane -- confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counts every j entry, including the odd leftover) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts noted in the loops above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*49);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..e9d6af0
--- /dev/null
@@ -0,0 +1,1168 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (tabulated; potential shifted by fr->ic->sh_ewald, masked at fr->rcoulomb)
+ * VdW interaction:            LennardJones (potential shifted via fr->ic->sh_invrc6; evaluated for i site 0 only)
+ * Geometry:                   Water3-Particle (three i sites per list entry, two j particles per SIMD pass)
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates, read through xx[0] with DIM reals per atom */
+                     rvec * gmx_restrict                    ff,           /* forces, updated in place through ff[0] */
+                     t_forcerec * gmx_restrict              fr,           /* supplies shift vectors, fshift, epsfac, nbfp, cutoff and Ewald table data */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* chargeA and typeA are read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* energygrp_elec/energygrp_vdw receive the potential sums */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, updated through inc_nrnb() */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv exposes the two table indices as integers; vfconv,gbconv are not used in this kernel */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters: taken from the first i entry only and reused for every list entry below */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* only i site 0 gets a VdW parameter offset */
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* not referenced again in this kernel */
+
+    /* Give the j indices and offsets defined values so compilers do not warn about uninitialised use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration; a leftover one is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i site 0 with j: Ewald electrostatics plus Lennard-Jones. Going by its name, any_lt lets the block run when some lane has rsq < rcutoff2 (helper not visible here). */
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp)); /* remainder of ewrt after removing the integer index */
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+            /* Table reads step by 4 reals per index; the name tabq_coul_FDV0 suggests an F,D,V,0 layout (table construction is not visible here) */
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec)); /* sh_ewald enters the potential only, not felec */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth)); /* sh_vdw_invrcut6 enters the potential only, not fvdw */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2); /* per-lane mask; by its name set where rsq00 < rcutoff2 */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: the same d*fscal term is accumulated for the i site and for the j particle */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i site 1 with j: Ewald electrostatics only (no VdW parameters are loaded for this site) */
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i site 2 with j: Ewald electrostatics only (no VdW parameters are loaded for this site) */
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+            /* Apply the j force accumulated over all three i sites to both j particles (helper name says "decrement") */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 168 flops */
+        }
+        /* Remainder: one j particle is left when the list has an odd number of entries; only jnrA is loaded */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i site 0 with the single j particle: Ewald electrostatics plus Lennard-Jones */
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+            /* Only table index i[0] is read here; the second set of table values is replaced by zeros */
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably keeps the low (A) element and pairs it with zero - intrinsic not visible here */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i site 1 with the single j particle: Ewald electrostatics only */
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i site 2 with the single j particle: Ewald electrostatics only */
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+            /* Apply the accumulated j force to the single j particle (helper name says "decrement") */
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 168 flops */
+        }
+
+        /* End of innermost loop */
+        /* Hand the accumulated forces of the three i sites to the helper together with f and fshift for this entry */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies in the energy-group slot selected by gid[iidx] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*168);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (force term built from fr->ic->tabq_coul_F table entries)
+ * VdW interaction:            LennardJones (c6/c12 read from fr->nbfp, for i atom 0 only)
+ * Geometry:                   Water3-Particle (three i atoms against single-atom j entries, two j entries per loop pass)
+ * Calculate force/pot:        Force (no energies are accumulated; kernel_data is not referenced)
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv lets the SIMD table index be read back as two integers; vfconv and gbconv are unused here */
+    /* NOTE(review): several locals above (e.g. tx, ty, tz, velecsum, vvdwsum, ggid) are never read in this kernel; apparently leftovers of the shared generator template. */
+    x                = xx[0]; /* coordinates addressed as a flat real array (offsets below are DIM*atom) */
+    f                = ff[0]; /* forces, addressed with the same offsets as x */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* not read again in this kernel */
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* not read again in this kernel */
+
+    /* Setup water-specific parameters: read once from the first i entry and reused for all nri entries (presumably all i waters in the list are identical; also reads iinr[0] even if nri is 0 - confirm with caller) */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset into vdwparam for i atom 0, the only i atom with LJ terms below */
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6); /* not read again in this kernel */
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* not read again in this kernel */
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A,B) per pass; a left-over single entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* Pair i0-j0: Ewald plus Lennard-Jones. The block is guarded by the cutoff test below, and fscal is additionally ANDed with cutoff_mask. */
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* makes the two table indices available as ewconv.i[0] and ewconv.i[1] */
+            /* eweps is the scaled distance minus its truncated index; ewtabF/ewtabFn are loaded at that index (presumably consecutive table entries - confirm in the kernelutil header) */
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: the same dx*fscal product is accumulated for the i atom and for the j atom */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* Pair i1-j0: Ewald electrostatics only (no Lennard-Jones term). */
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* Pair i2-j0: Ewald electrostatics only (no Lennard-Jones term). */
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+            /* Pass the j force summed over the three pairs to the helper for both j atoms (a decrement of f, going by the helper name) */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 136 flops */
+        }
+        /* Remainder: a single j entry is left over when the j count is odd; only jnrA and ewconv.i[0] are used below */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* only jnrA's charge is loaded; the other input is a zero vector */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* Pair i0-j0 for the single remaining j entry: Ewald plus Lennard-Jones. */
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the first lane and zeroes the other so the unused slot adds no force - confirm intrinsic semantics */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* Pair i1-j0 for the single remaining j entry: Ewald electrostatics only. */
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* Pair i2-j0 for the single remaining j entry: Ewald electrostatics only. */
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 136 flops */
+        }
+
+        /* End of innermost loop */
+        /* Hand the accumulated i forces to the helper together with the f and fshift positions of this i entry */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j entry, not per two-entry SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+    /* Reported under eNR_NBKERNEL_ELEC_VDW_W3_F: 18 per outer iteration plus 136 per j entry */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*136);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7f31d05
--- /dev/null
@@ -0,0 +1,2408 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 459 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 459 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*459);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 385 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 385 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*385);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..0e61a06
--- /dev/null
@@ -0,0 +1,1312 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 194 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 194 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*194);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel. For every i entry of the neighborlist, four i sites
+ * (suffixes 0-3) interact with each j particle of that entry:
+ *   - site 0 with j: Lennard-Jones force only (the rsq00 section),
+ *   - sites 1,2,3 with j: table-based Ewald force only (rsq10/rsq20/rsq30 sections).
+ * Every contribution is masked with rsq < rcutoff2, where rcutoff2 is the
+ * square of fr->rcoulomb.
+ *
+ * Arguments, as used in this function:
+ *   nlist       - reads nri, iinr, jindex, jjnr, shift and gid.
+ *   xx          - coordinates; accessed as a flat real array through xx[0].
+ *   ff          - forces; accessed as a flat real array through ff[0] and handed
+ *                 to the gmx_fjsp_decrement_* / gmx_fjsp_update_iforce_* helpers.
+ *   fr          - reads shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rvdw and
+ *                 the fr->ic Ewald table fields.
+ *   mdatoms     - reads chargeA and typeA.
+ *   kernel_data - not referenced in this force-only kernel.
+ *   nrnb        - receives the flop estimate through inc_nrnb().
+ *
+ * NOTE(review): several locals (e.g. sh_ewald, sh_vdw_invrcut6, rvdw, one_sixth,
+ * one_twelfth, one, two, gid/ggid) are declared or assigned but never read below;
+ * they look like leftovers of the kernel generator shared with the
+ * potential+force variant.
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Two-digit suffixes such as dx10 or rsq30 name an i-site/j-site pair (i site first).
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters.
+     * The scaled charges of sites 1-3 and the VdW parameter offset of site 0 are read
+     * once, from the first i entry (iinr[0]), and reused for every i entry below.
+     * NOTE(review): this presumably relies on all i molecules in the list being
+     * identical - confirm against the neighborlist construction.
+     * No charge is loaded for site 0 and no VdW offset for sites 1-3.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j entries (A and B) are processed per iteration; the loop bound of
+         * j_index_end-1 leaves a possible last, unpaired entry to the separate
+         * single-j section that follows the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it.
+             * Site 0 only needs 1/r^2 (rinvsq00, for Lennard-Jones); sites 1-3 also need
+             * 1/r (rinv10..rinv30), from which their 1/r^2 is formed by squaring.
+             */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 0 - j: LJ force *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * Only the force scalar is formed: fvdw is built from c12_00, c6_00, rinvsix
+             * and rinvsq00. No VdW potential is computed in this kernel.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force.
+             * fscal has been ANDed with cutoff_mask above. The same dx*fscal products are
+             * accumulated into both the i-site and the j force registers; the j total is
+             * handed to the gmx_fjsp_decrement_* helper after the last site.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 1 - j: Ewald    *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * eweps is the remaining fractional part. The integer indices are stored to
+             * ewconv so they can be read back as scalars (ewconv.i[0], ewconv.i[1]) for the
+             * table loads from ewtab (fr->ic->tabq_coul_F). felec is then formed from the
+             * loaded ewtabF/ewtabFn values and eweps - presumably a linear interpolation
+             * between consecutive table entries; confirm against the madd/nmsub helpers.
+             */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 2 - j: Ewald    *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 3 - j: Ewald    *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 162 flops */
+        }
+
+        if(jidx<j_index_end) /* one unpaired j entry is left when the j count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j entry; only pointer A is used) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 0 - j: LJ force *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (force only, as in the paired loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force.
+             * Unlike the paired loop, fscal is additionally combined with a zero vector
+             * through _fjsp_unpacklo_v2r8 before use - presumably to clear the SIMD
+             * position that holds no j atom here; confirm against the intrinsic's definition.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 1 - j: Ewald    *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * Only ewconv.i[0] is used for the table load in this single-j section.
+             */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 2 - j: Ewald    *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 3 - j: Ewald    *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 162 flops */
+        }
+
+        /* End of innermost loop.
+         * The twelve i-site force accumulators are now passed, together with the force
+         * and shift-force locations of this i entry, to the i-force update helper.
+         */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 24 per i entry plus 162 per j entry, as noted in the loops above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*162);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..54aa1f9
--- /dev/null
@@ -0,0 +1,2564 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to the data put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 488 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 488 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*488);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*414);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..29c0330
--- /dev/null
@@ -0,0 +1,599 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*49);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (force table fr->ic->tabq_coul_F; force masked with the rsq < rcoulomb^2 comparison)
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle (one i particle against j particles taken two at a time)
+ * Calculate force/pot:        Force (kernel_data is not referenced; no potential sums are accumulated)
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     * NOTE(review): several locals declared below (e.g. tx, velecsum, krf, c6_00, dummy_mask) are never referenced in this kernel, and sh_ewald / ewtabhalfspace are assigned but never read. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice; the squared cutoff (rcutoff2) is what gets compared against rsq below */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Zero-initialize the j indices and offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i particle (iinr[iidx]) per entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector in shiftvec/fshift (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this list's j entries are jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer (i) particle index and its offset into the flat x/f arrays */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge scaled by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop: j neighbors are consumed in pairs (A,B); an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get the two j neighbor indices and their coordinate offsets */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* Load j atom coordinates for both neighbors */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it: rinv00 from the invsqrt helper and rinvsq00 = rinv00*rinv00 */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: the charges of jnrA and jnrB */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms: product of the scaled i charge and the j charges */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS: the force table is sampled per lane and combined using eweps (presumably linear interpolation between adjacent entries - confirm against the madd/nmsub intrinsic definitions), then subtracted from rinvsq00 and scaled by qq00*rinv00 */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is what remains of ewrt after subtracting the index */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate fscal*d into the i force; the helper call below is handed the same fscal and d for the two j particles' slots in f */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 42 flops (the per-j-entry factor passed to inc_nrnb at the end) */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Load j atom coordinates for the single leftover neighbor (odd number of j entries) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it: rinv00 from the invsqrt helper and rinvsq00 = rinv00*rinv00 */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: one charge, loaded via loadl on top of a zeroed register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms: product of the scaled i charge and the j charge */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS (same arithmetic as the paired loop, but only ewconv.i[0] is used for the table lookup) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is what remains of ewrt after subtracting the index */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force; fscal was first passed through unpacklo with a zero register (presumably to clear the lane that has no j particle - confirm) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 42 flops (the per-j-entry factor passed to inc_nrnb at the end) */
+        }
+
+        /* End of innermost loop: pass the accumulated i force to the helper together with this particle's slot in f and this list's slot in fshift */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations by the number of j entries in this list */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops (the per-list factor passed to inc_nrnb at the end) */
+    }
+
+    /* Increment number of outer iterations: one per neighborlist entry */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: report outeriter*7 + inneriter*42 under eNR_NBKERNEL_ELEC_F */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*42);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..2e68463
--- /dev/null
@@ -0,0 +1,1095 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*150);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (tabulated; force read from fr->ic->tabq_coul_F, masked against fr->rcoulomb)
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle (three i atoms per list entry against single j atoms)
+ * Calculate force/pot:        Force (only f and fshift are passed to the update helpers; no energy is written)
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used here: it lets the stored table indices be read back as integers */
+
+    x                = xx[0]; /* flat real view of the coordinates, indexed with DIM*atom offsets below */
+    f                = ff[0]; /* flat real view of the forces, indexed the same way */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac); /* folded into the i-atom charges below */
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set but never read in this force-only variant */
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* set but never read in this force-only variant */
+
+    /* Setup water-specific parameters: the three i charges are read once from the first list entry and scaled by facel (assumes all i waters in the list carry the same charges) */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff); /* squared cutoff, compared against rsq so no square root is needed for the test */
+
+    /* Avoid stupid compiler warnings (give the j indices and offsets defined values before the loops) */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per list entry (i water plus its shift index) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8(); /* i-force accumulators for this list entry, handed to the update helper after the j loops */
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j neighbors are processed two at a time (A and B); an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i atom minus j atom, for each of the three i atoms) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles (the charges of jnrA and jnrB) */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j-force accumulators for this pair of neighbors */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* i atom 0 vs j; NOTE(review): presumably true when at least one lane has rsq < rcutoff2 - confirm in the kernelutil header */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS (force only, using the tabulated values in ewtab) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp)); /* remainder of ewrt after removing its converted integer part */
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* store the indices so they can be read back through ewconv.i[] */
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn); /* NOTE(review): presumably loads the table entry and its successor for each lane - confirm */
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* looks like linear interpolation (1-eweps)*F + eweps*Fn, assuming nmsub(a,b,c) = c - a*b - TODO confirm */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec)); /* (qq*rinv) * (rinvsq - tabulated term) */
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2); /* per-lane comparison of rsq against the squared cutoff */
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* drop the force in lanes that fail the cutoff test */
+
+            /* Update vectorial force (the same fscal*d term is added to the i and the j accumulators) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2)) /* i atom 1 vs j; same sequence as the 00 block */
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS (force only, using the tabulated values in ewtab) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2)) /* i atom 2 vs j; same sequence as the 00 block */
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS (force only, using the tabulated values in ewtab) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* apply the summed j forces to atoms A and B (presumably subtracted, per the helper name) */
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end) /* one j atom is left over when the neighbor count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles (single charge of jnrA combined with a zero vector) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* i atom 0 vs the leftover j atom */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS (force only, using the tabulated values in ewtab) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn); /* only index i[0] is used here; i[1] belongs to the lane without a j atom */
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* NOTE(review): presumably zeroes the lane that has no j atom so it adds no force - confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2)) /* i atom 1 vs the leftover j atom */
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS (force only, using the tabulated values in ewtab) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2)) /* i atom 2 vs the leftover j atom */
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS (force only, using the tabulated values in ewtab) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* apply the summed j force to the single leftover atom */
+
+            /* Inner loop uses 129 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j atom, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 18 per outer iteration plus 129 per j atom, using the constants noted in the loops above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*129);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9f2d5ae
--- /dev/null
@@ -0,0 +1,2341 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 441 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 441 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*441);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*378);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..8c18557
--- /dev/null
@@ -0,0 +1,1095 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*150);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Overview of what the code below does:
+ *  - Iterates over the nlist->nri neighbor-list entries. For each entry it
+ *    loads three i-atom positions (suffixes 1,2,3), read starting at
+ *    x+i_coord_offset+DIM, i.e. one rvec past the atom indexed by iinr[iidx].
+ *  - Walks the j atoms jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1] two at a
+ *    time (A and B), followed by a single-atom tail when the count is odd.
+ *  - For every i/j pair the force scalar is built from the Ewald force table
+ *    fr->ic->tabq_coul_F and masked with the fr->rcoulomb cutoff.
+ *  - The accumulated i and j force registers are handed, together with
+ *    pointers into f (= ff[0]) and fr->fshift, to the gmx_fjsp_decrement_* and
+ *    gmx_fjsp_update_iforce_* helpers.
+ *  - No energies are accumulated; kernel_data is not referenced in this
+ *    force-only variant.
+ *  - Flop estimates are reported through inc_nrnb() at the end.
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Several of the locals declared below (for example velec, velecsum, one, two,
+     * vfconv, gbconv and the c6/c12 variables) are not referenced anywhere in this
+     * force-only kernel; presumably they come from the shared generator template.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv is used to read the two table indices back as integers */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set here but not used further in this force-only kernel */
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* set here but not used further in this force-only kernel */
+
+    /* Setup water-specific parameters.
+     * The charges of i atoms 1..3 are read once, relative to the first entry
+     * nlist->iinr[0], and pre-multiplied by facel; they are reused unchanged
+     * for every entry of the outer loop.
+     * NOTE(review): iinr[0] is read before the loop, so this assumes nri > 0 - confirm with callers.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Pre-initialize the j indices and offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * The "+DIM" skips the first atom of the i entry, so only atoms 1..3 are loaded.
+         */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) per iteration.
+         * The bound j_index_end-1 leaves an odd last j atom for the tail block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2)) /* going by the helper name: enter only if some lane has rsq < rcutoff2 (helper not visible here) */
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* combines the two loaded table values using eweps; presumably linear interpolation - confirm nmsub semantics */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* presumably zeroes fscal in lanes where rsq < rcutoff2 does not hold - assumes the usual all-ones/all-zeros compare mask */
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end) /* tail: one j atom is left when the number of j atoms is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles (single j atom: only charge[jnrA] is read, on top of a zeroed register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn); /* only table index i[0] is used in the single-j tail */
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* NOTE(review): presumably clears the second element so the unused lane adds no force - confirm unpacklo semantics */
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j atom, not per SIMD iteration) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: reported under eNR_NBKERNEL_ELEC_W4_F using the
+     * per-iteration estimates quoted in the loop comments above (18 outer, 129 inner).
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*129);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..b89e62b
--- /dev/null
@@ -0,0 +1,2341 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 441 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 441 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*441);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*378);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..0fb4e05
--- /dev/null
@@ -0,0 +1,759 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i entry of the neighbor list (nlist->nri of them) this kernel walks the
+ * j neighbors jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1] two at a time, evaluates
+ * table-based Ewald electrostatics and Lennard-Jones, multiplies both by the same
+ * switch function of r, and masks the results with (rsq < rcutoff2).
+ *
+ * Inputs read:
+ *   nlist       - nri, iinr, jindex, jjnr, shift, gid
+ *   xx          - coordinates, accessed as a flat real array with DIM reals per atom
+ *   fr          - epsfac, ntype, nbfp, shift_vec, rcoulomb, rcoulomb_switch and
+ *                 ic->sh_ewald, ic->tabq_coul_FDV0, ic->tabq_scale
+ *   mdatoms     - chargeA, typeA
+ * Outputs updated (through the gmx_fjsp_* helper routines and inc_nrnb):
+ *   ff          - forces of the i and j atoms
+ *   fr->fshift  - shift forces, at the shift index of each i entry
+ *   kernel_data - energygrp_elec[gid] and energygrp_vdw[gid]
+ *   nrnb        - flop accounting
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * The declarations below come from the kernel generator; several of them
+     * (for example vfconv and gbconv) are never referenced in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters.
+     * With d = max(r - rswitch, 0), the loops below evaluate
+     *   sw  = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5
+     *   dsw =     swF2*d^2 + swF3*d^3 + swF4*d^4   (term-by-term derivative of sw)
+     * assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c - TODO confirm against the intrinsics reference.
+     * The same switch interval (rcoulomb_switch .. rcoulomb) is applied to both elec and VdW.
+     * The vector 'd' assigned here is reassigned inside the loops before it is read.
+     */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Give the j indices/offsets defined values up front to avoid compiler warnings
+     * about possibly uninitialized variables */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the charge is pre-multiplied by facel (fr->epsfac),
+         * and vdwioffset0 selects the i atom's row in the vdwparam table (2 reals per type pair) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j neighbors (A and B) are handled per iteration; the loop bound
+         * j_index_end-1 leaves a single trailing neighbor, if the count is odd,
+         * to the separate block after the loop. */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * eweps is ewrt minus the converted index; the indices are stored through the
+             * ewconv union so that they can be read back as scalar array offsets.
+             * Each table point occupies 4 reals (offset 4*index); presumably these are
+             * F,D,V,0 as the tabq_coul_FDV0 name suggests, so the loads below fetch {F,D}
+             * and V for lanes A and B and the transposes regroup them per quantity
+             * - TODO confirm against the table setup code. */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * vvdw = vvdw12/12 - vvdw6/6 and fvdw = (vvdw12 - vvdw6)*rinvsq, assuming
+             * _fjsp_msub_v2r8(a,b,c) computes a*b-c; this presumably relies on vdwparam
+             * holding 6*C6 and 12*C12 - TODO confirm. */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function (sw and dsw were computed just above from d = max(r-rswitch,0)).
+             * The forces must be updated before the potentials, since they use the unswitched velec/vvdw. */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * The AND with cutoff_mask is applied per lane, because the enclosing test
+             * only checks whether any lane is inside the cutoff. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate on the i atom here; the helper call
+             * below applies the corresponding update to the two j atoms in f */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 86 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates.
+             * This block handles the single neighbor left over when the j count is odd.
+             * Only one j atom is loaded; the values computed for the second SIMD lane are
+             * discarded further down by the _fjsp_unpacklo_v2r8(...,zero) calls. */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * Only ewconv.i[0] is used for table loads here; zero vectors stand in for the
+             * second lane's table data before the transposes. */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the paired loop above) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * The unpacklo with a zero vector presumably keeps the first lane and clears the
+             * second, so that the lane without a real j atom adds nothing - TODO confirm. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force (single j atom variant of the helper call) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 86 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper, which is
+         * given both the i atom's slot in f and this entry's slot in fshift */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies in the energy-group slot selected by gid[iidx] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j atom, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*86);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every i particle in the neighborlist it loops over
+ * the j particles two at a time (one per SIMD lane), evaluates tabulated
+ * Ewald electrostatics and Lennard-Jones, multiplies both by the same switch
+ * polynomial between fr->rcoulomb_switch and fr->rcoulomb, and accumulates
+ * forces. No potential energies are summed or stored here; the potentials
+ * are only computed because the switched force needs them.
+ *
+ * Parameters as used by this kernel:
+ *   nlist       - neighborlist; nri, iinr, jindex, jjnr, shift and gid are read.
+ *   xx          - coordinates, accessed as a flat real array through xx[0].
+ *   ff          - forces, accessed as a flat real array through ff[0] and
+ *                 handed to the gmx_fjsp_* force helpers for updating.
+ *   fr          - supplies shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb,
+ *                 rcoulomb_switch and the Ewald table data in fr->ic.
+ *   mdatoms     - supplies chargeA and typeA.
+ *   kernel_data - not referenced by this force-only kernel.
+ *   nrnb        - flop counter, updated once at the end through inc_nrnb().
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * The declaration list below is shared generator boilerplate; several of
+     * these variables are never referenced in this particular kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used in this kernel */
+
+    x                = xx[0]; /* flat real view of the coordinate array */
+    f                = ff[0]; /* flat real view of the force array */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* stored but not read again in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set but not referenced again in this kernel */
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch; /* the same switch start is applied to both elec and VdW below */
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters.
+     * d_scalar is the width of the switching region (cutoff minus switch start).
+     * swV3, swV4, swV5 are the coefficients of d^3, d^4, d^5 in the switch
+     * polynomial sw, and swF2, swF3, swF4 (equal to 3*swV3, 4*swV4, 5*swV5)
+     * are the coefficients of d^2, d^3, d^4 in its derivative dsw, where d is
+     * the clamped distance past rswitch that is computed in the inner loop.
+     */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar); /* overwritten in the inner loop before it is read */
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings: these are otherwise only assigned inside the j loops */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* row offset into vdwparam, two reals per type pair */
+
+        /* Start inner kernel loop: two j atoms (lanes A and B) per iteration.
+         * An odd leftover j atom is handled separately after this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* individual lanes are masked with cutoff_mask further down */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* indices are read back per lane through ewconv.i[] */
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* lane A entry; the table is indexed with a stride of 4 reals */
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] ); /* lane B entry */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2); /* ewtabFn is only a temporary for lane B here */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec)); /* potential is only needed for the switch derivative term */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) ); /* potential is only needed for the switch derivative term */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* clamp at zero for distances below rswitch */
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 80 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j atoms: process the last one through the A slot only */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* only ewconv.i[0] is read in this single-atom path */
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8(); /* no second table entry is loaded here */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8(); /* no second table entry is loaded here */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec)); /* potential is only needed for the switch derivative term */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) ); /* potential is only needed for the switch derivative term */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* clamp at zero for distances below rswitch */
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* NOTE(review): presumably zeroes the unused B lane so it adds no force - confirm unpacklo semantics */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 80 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*80);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9fd2aff
--- /dev/null
@@ -0,0 +1,1365 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 225 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 225 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*225);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (with switch function)
+ * VdW interaction:            LennardJones (with switch function)
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 213 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 213 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*213);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..4f7cf4c
--- /dev/null
@@ -0,0 +1,2935 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 630 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 630 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*630);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 600 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 600 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*600);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..1a8925d
--- /dev/null
@@ -0,0 +1,1557 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 269 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 269 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*269);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 257 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 257 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*257);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d76393f
--- /dev/null
@@ -0,0 +1,3139 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 677 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 677 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*677);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Zero-initialize the j indices and coordinate offsets to silence compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 647 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 647 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*647);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..898dec9
--- /dev/null
@@ -0,0 +1,682 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 68 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 68 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*68);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (table lookup), multiplied by a switch function
+ *                             between fr->rcoulomb_switch and fr->rcoulomb
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only variant: no potential energy is accumulated or written out.
+ * For every i entry of the neighbor list the kernel walks its j entries two
+ * at a time (one per SIMD lane), with a separate tail section for an odd
+ * leftover j entry.
+ *
+ * Parameters:
+ *   nlist        neighbor list; the fields nri, iinr, jindex, jjnr, shift and gid are read.
+ *   xx           coordinates, accessed as a flat real array (xx[0]) at offset DIM*atom.
+ *   ff           forces, accessed as a flat real array (ff[0]); j forces are updated through
+ *                the gmx_fjsp_decrement_fma_* helpers and i forces through
+ *                gmx_fjsp_update_iforce_1atom_swizzle_v2r8.
+ *   fr           force record; shift_vec, fshift, epsfac, rcoulomb, rcoulomb_switch and the
+ *                Ewald table data in fr->ic (tabq_coul_FDV0, tabq_scale, sh_ewald) are read.
+ *   mdatoms      only chargeA is read.
+ *   kernel_data  not referenced in this force-only kernel.
+ *   nrnb         flop accounting, updated once at the end through inc_nrnb().
+ *
+ * NOTE(review): many of the locals declared below (e.g. tx, ty, tz, jidxall, velecsum,
+ * ggid, two, dummy_mask, vfconv, gbconv) are never referenced in this function;
+ * presumably they stem from the shared kernel generator template.
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: SIMD <-> scalar view of the table indices */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* loaded but not referenced again in this kernel */
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters: swV3..swV5 and swF2..swF4 are the coefficients of the
+     * polynomials in d=(r-rswitch) that the j loops evaluate as sw and dsw. They are
+     * scaled by powers of the switch region width d_scalar=rcutoff-rswitch.
+     */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar); /* overwritten in the j loops before it is ever read */
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Give these a defined value before the loops to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles (the i charge is pre-multiplied by facel, i.e. fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop: two j entries (A and B) are handled per iteration;
+         * an odd leftover entry is handled in the section after this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one lane is inside the cutoff - TODO confirm */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * eweps is the remainder after subtracting the integer part. The integer indices
+             * are stored through the ewconv union so they can be read back as scalars.
+             */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* elements 0,1 of the 4-real table record for index i[0] */
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] ); /* elements 0,1 of the 4-real table record for index i[1] */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2); /* element 2 of the record for index i[0] */
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2); /* element 2 of the record for index i[1] */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function. The potential velec is still computed in this
+             * force-only kernel because the switched force depends on it (v*dsw term).
+             */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* presumably clears the lanes with rsq >= rcutoff2 - TODO confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 65 flops */
+        }
+
+        if(jidx<j_index_end) /* one j entry is left over when the number of j entries is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * Only ewconv.i[0] is used for table lookups in this single-j section.
+             */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8(); /* no second j entry: zeros take the place of the second table record */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8(); /* no second j entry: zeros take the place of the second table record */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function (same expression as in the two-j loop above) */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the lane that holds no real j entry - TODO confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 65 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together
+         * with the force and shift-force locations for this i entry.
+         */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*65);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..e14474f
--- /dev/null
@@ -0,0 +1,1288 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 207 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 207 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*207);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 198 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 198 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*198);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..a176d61
--- /dev/null
@@ -0,0 +1,2864 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 612 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 612 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*612);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 585 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 585 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*585);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..ff1300a
--- /dev/null
@@ -0,0 +1,1288 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 207 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 207 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*207);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (tabulated via tabq_coul_FDV0, switched between rcoulomb_switch and rcoulomb)
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle (i atoms 1-3 of each water against single j particles)
+ * Calculate force/pot:        Force (forces only; no potential energy is accumulated)
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters: charges of i atoms 1-3, read once from the first list entry and scaled by epsfac */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters: swV3-5 and swF2-4 are the coefficients of the sw and dsw polynomials in d=r-rswitch */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Initialize j indices and offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) per iteration; a leftover single j atom is handled below */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 198 flops */
+        }
+
+        if(jidx<j_index_end) /* one j atom left over after the pairwise loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for the single j particle A (loadl on top of a zeroed register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second element - TODO confirm */
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second element - TODO confirm */
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second element - TODO confirm */
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 198 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*198);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..66de2dd
--- /dev/null
@@ -0,0 +1,2864 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 612 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 612 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*612);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 585 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 585 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*585);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..3f30f96
--- /dev/null
@@ -0,0 +1,740 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 78 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 78 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*78);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only variant: no potential energies are accumulated or written here.
+ * For every i entry of the neighbor list the j neighbors are processed two at
+ * a time (suffixes A and B), followed by a single-j epilogue when the number
+ * of neighbors is odd.
+ *
+ * Arguments, as used by this function:
+ *   nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+ *   xx           coordinates, accessed as a flat real array with DIM reals per atom
+ *   ff           forces, same flat layout, updated in place
+ *   fr           supplies shift_vec, fshift, epsfac, ntype, nbfp and the Ewald
+ *                table data in ic (sh_ewald, tabq_coul_F, tabq_scale)
+ *   mdatoms      supplies chargeA and typeA
+ *   kernel_data  supplies table_vdw (data and scale)
+ *   nrnb         flop accounting, updated once at the end through inc_nrnb()
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two
+     * different jnr indices whose data is put in the two positions of a _fjsp_v2r8 register.
+     *
+     * Many of the locals declared below (cutoff, reaction-field, potential and mask
+     * variables, gbconv, ggid, ...) are never referenced in this force-only kernel;
+     * presumably the generator emits one common declaration set for all kernel flavors.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* SIMD register <-> two integer table indices */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Ewald setup: the force-only table tabq_coul_F is used; sh_ewald and ewtabhalfspace
+     * are set here but not read again in this function. */
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Pre-initialize to avoid compiler warnings about possibly uninitialized variables */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start..j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        /* Reset the i-atom force accumulators for this list */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge pre-scaled by facel, and the offset of
+         * this atom type's row in vdwparam (2 reals, c6 and c12, per type pair) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop, two j atoms (A and B) per iteration; a leftover
+         * single j atom is handled by the epilogue after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* r = rsq * (1/r) */
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer;
+             * vfeps is the fractional remainder used as the spline argument */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            /* 8 reals per table point: Y,F,G,H for dispersion followed by Y,F,G,H for repulsion */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+            /* Fetch two consecutive table entries per lane and blend them; assuming
+             * madd(a,b,c)=a*b+c and nmsub(a,b,c)=c-a*b (TODO confirm against the intrinsics
+             * reference) this is felec = (1-eweps)*ewtabF + eweps*ewtabFn */
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION
+             * Table offsets +0..+3 of each lane's table point are loaded and transposed into
+             * Y,F,G,H. Only the derivative FF is evaluated (Y is loaded but not used here). */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION
+             * Same evaluation using table offsets +4..+7, scaled by c12 */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force: accumulate fscal*d on the i atom; the helper call below
+             * presumably applies the opposite contribution to both j atoms in f */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 65 flops */
+        }
+
+        /* Epilogue: one j atom is left when the neighbor count is odd. Only table index
+         * i[0] is used below and the second operand of every transpose is zero. */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            /* Same 8-real table stride as in the main loop; i[1] is scaled but not read afterwards */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+            /* Same two-entry blend of the force table as in the main loop, for index i[0] only */
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* NOTE(review): presumably keeps the first element of fscal and zeroes the second,
+             * so the unused SIMD position adds nothing to the i force - confirm */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 65 flops */
+        }
+
+        /* End of innermost loop */
+
+        /* Hand the accumulated i force to the helper together with the i atom's slot in f
+         * and this list's slot in fshift */
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j atom, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 7 per outer iteration and 65 per inner iteration,
+     * matching the per-loop flop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*65);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..2ebbb0d
--- /dev/null
@@ -0,0 +1,1160 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 169 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 169 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*169);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 146 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 146 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*146);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..6ab689a
--- /dev/null
@@ -0,0 +1,2172 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 430 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 430 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*430);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 377 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 377 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*377);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..554c18d
--- /dev/null
@@ -0,0 +1,1276 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 194 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 194 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*194);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (tabulated correction; evaluated for i-atoms 1, 2 and 3 only)
+ * VdW interaction:            CubicSplineTable (evaluated for i-atom 0 only)
+ * Geometry:                   Water4-Particle (four i-atoms per list entry against single j particles)
+ * Calculate force/pot:        Force (forces only; no potential energies are accumulated here)
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is placed in the two positions (lanes) of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* lets the stored SIMD table indices be read back as two scalar integers */
+    /* View the rvec arrays as flat real arrays; atoms are addressed below as base + DIM*atom_index */
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    /* Tabulated VdW: table data and the scale factor that converts r into a table index */
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+    /* Ewald: force table and its scale; sh_ewald and ewtabhalfspace are set up but not referenced again in this force-only kernel */
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters: read once from the first i entry and reused for every entry (NOTE(review): assumes all i waters in the list share charges and VdW type - confirm with the list builder). The i charges are pre-multiplied by facel. */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset into vdwparam for i-atom 0; the j-type offset is added per pair below */
+
+    /* Initialize up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        /* Reset the i-force accumulators for this list entry */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j particles are processed two at a time (A and B); an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+            /* 1/r^2 is only needed for the electrostatic pairs (i-atoms 1-3) */
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            /* j-force accumulators are reset for every j pair and written back at the end of the iteration */
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i-atom 0 with the j particles: tabulated VdW only */
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Table stride is 8 reals per point: offsets 0-3 are read for dispersion, 4-7 for repulsion */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            /* Accumulate the same term for the j particles; it is handed to the decrement helper after all four i-atoms */
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i-atom 1 with the j particles: Ewald electrostatics only */
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+            /* Fetch a pair of table values per lane starting at the index; presumably a linear interpolation with weight eweps (assuming nmsub(a,b,c) = c - a*b) - TODO confirm */
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i-atom 2 with the j particles: Ewald electrostatics only */
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i-atom 3 with the j particles: Ewald electrostatics only */
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+            /* Hand the accumulated j contributions for both j particles to the force array */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 171 flops */
+        }
+        /* Odd number of j particles: handle the last one alone; each fscal is combined with zero via unpacklo before use, presumably to clear the unused second lane */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i-atom 0 with the single j particle: tabulated VdW only */
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8; /* scaled for symmetry only: no load below uses vfconv.i[1] */
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            /* Accumulate the same term for the j particle; it is handed to the decrement helper after all four i-atoms */
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i-atom 1 with the single j particle: Ewald electrostatics only */
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i-atom 2 with the single j particle: Ewald electrostatics only */
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i-atom 3 with the single j particle: Ewald electrostatics only */
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 171 flops */
+        }
+
+        /* End of innermost loop */
+        /* Write back the accumulated forces of the four i-atoms; the helper also receives the shift-force slot of this entry */
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 24 per outer iteration and 171 per inner iteration are reported to the nrnb counter */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*171);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..63d5a18
--- /dev/null
@@ -0,0 +1,2300 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 458 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 458 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*458);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 405 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 405 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*405);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..eb3a7a1
--- /dev/null
@@ -0,0 +1,614 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 56 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 56 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*56);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (force interpolated from the fr->ic->tabq_coul_F table)
+ * VdW interaction:            LennardJones (c6/c12 pairs read from fr->nbfp)
+ * Geometry:                   Particle-Particle (a single i particle per neighbor list entry)
+ * Calculate force/pot:        Force (only f and fshift are updated; no energies are accumulated)
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar; /* declared by the generator; not referenced in this kernel */
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv: read a stored SIMD value back as two integers */
+
+    x                = xx[0]; /* flat real view of the coordinates, indexed with DIM*atom below */
+    f                = ff[0]; /* flat real view of the forces, indexed with DIM*atom below */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* set but not read in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set but not read in this force-only kernel */
+    ewtab            = fr->ic->tabq_coul_F; /* force-only table; indexed directly by the table index below */
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* set but not read in this force-only kernel */
+
+    /* Give these a defined value up front; they are otherwise only assigned inside the j loops (silences compiler warnings) */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i particle (and one shift vector) per entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this entry's j atoms are jjnr[j_index_start..j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge pre-multiplied by facel, and the i type's offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j entries (A and B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp)); /* remainder of ewrt after removing the integer index */
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* store so the two indices can be read as ewconv.i[0] and ewconv.i[1] */
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* presumably (1-eweps)*ewtabF + eweps*ewtabFn; assumes nmsub(a,b,c)=c-a*b - confirm */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00)); /* presumably (c12*rinvsix - c6)*rinvsix*rinvsq; assumes msub(a,b,c)=a*b-c - confirm */
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end) /* exactly one unpaired j entry is left when the neighbor count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* single j charge combined with a zeroed register */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp)); /* remainder of ewrt after removing the integer index */
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* only ewconv.i[0] is read back for the single j entry */
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* same interpolation expression as in the paired loop above */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second element so it adds no force - confirm unpacklo semantics */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counts j entries, including an odd leftover) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops using the per-iteration counts quoted above (7 outer, 46 inner) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*46);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..b82d07d
--- /dev/null
@@ -0,0 +1,1034 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 147 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 147 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*147);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * For each of the nlist->nri neighbor-list entries this routine loads the
+ * three i-atom positions (with the shift vector added), walks the j atoms of
+ * that entry two at a time (plus a single-atom remainder), and accumulates
+ * scalar force times displacement for both the i atoms and the j atoms.
+ * Only forces are produced: no potential-energy sums are formed or written.
+ * Lennard-Jones is evaluated for i atom 0 only; i atoms 1 and 2 interact with
+ * the j atom through the tabulated Ewald term alone.
+ *
+ * The accumulated forces are handed to the gmx_fjsp_* helpers together with
+ * pointers into ff and fr->fshift, and the flop estimate is reported through
+ * inc_nrnb(). mdatoms supplies chargeA/typeA; kernel_data is not referenced
+ * in this force-only variant.
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Several of the locals declared below (e.g. tx, rcutoff, krf, vfconv, gbconv) are never
+     * referenced in this kernel variant.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv: SIMD value stored, then read back as two integer table indices */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters.
+     * The charges and the VdW type offset are read once, from the first i entry
+     * (iinr[0]), and reused for every list entry in the outer loop below.
+     * NOTE(review): iinr[0] is read unconditionally, so this presumably relies on
+     * the caller passing a non-empty list whose i waters all share these
+     * parameters - confirm against the caller.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) per iteration; an odd
+         * leftover j atom is handled by the if-block after this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 0 - j atom 0    *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp)); /* fractional part of the table coordinate */
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn); /* presumably the table entry at the index and the one after it, per lane - confirm in kernelutil header */
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* looks like linear interpolation (1-eps)*F + eps*Fn, assuming madd(a,b,c)=a*b+c and nmsub(a,b,c)=c-a*b - TODO confirm */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00)); /* presumably (c12*rinvsix - c6)*rinvsix*rinvsq, assuming msub(a,b,c)=a*b-c */
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 1 - j atom 0    *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 2 - j atom 0    *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* hand the summed j force for atoms A and B to the force array helper */
+
+            /* Inner loop uses 127 flops */
+        }
+
+        if(jidx<j_index_end) /* remainder: exactly one unpaired j atom is left */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 0 - j atom 0    *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn); /* only index slot 0 is used in the single-j case */
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second lane so it adds nothing to the forces - confirm in kernelutil header */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 1 - j atom 0    *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 2 - j atom 0    *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 127 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset); /* passes the nine i-force accumulators plus the force and shift-force destinations */
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*127); /* 18 per list entry plus 127 per j atom, matching the loop comments above */
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..a4d4afe
--- /dev/null
@@ -0,0 +1,2046 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 408 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 408 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*408);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 358 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 358 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*358);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c21ef40
--- /dev/null
@@ -0,0 +1,1142 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 170 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 170 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*170);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel. For each of the nlist->nri list entries it loads four i sites
+ * (suffixes 0..3) and walks the j indices jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1]:
+ *   - i site 0 interacts with the j particle through Lennard-Jones only (c6/c12 pairs
+ *     read from fr->nbfp);
+ *   - i sites 1,2,3 interact through tabulated Ewald electrostatics only (charges from
+ *     mdatoms->chargeA scaled by fr->epsfac, table fr->ic->tabq_coul_F).
+ * No energies are accumulated here: velecsum/vvdwsum are declared but never written,
+ * and kernel_data is not referenced in this function.
+ *
+ * Parameters, as used by the code below:
+ *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+ *                 (gid is stored in a local but not used afterwards).
+ *   xx          - coordinates; xx[0] is used as a flat real array indexed by DIM*atom.
+ *   ff          - forces; ff[0] is used as a flat real array and is passed to the
+ *                 gmx_fjsp_decrement_1rvec_*_swizzle_v2r8 helpers (j particles) and to
+ *                 gmx_fjsp_update_iforce_4atom_swizzle_v2r8 (i sites).
+ *   fr          - supplies shift_vec, fshift, epsfac, ntype, nbfp and the Ewald table
+ *                 data in fr->ic (sh_ewald, tabq_coul_F, tabq_scale).
+ *   mdatoms     - supplies chargeA and typeA.
+ *   kernel_data - unused in this force-only kernel.
+ *   nrnb        - receives the flop count through inc_nrnb().
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Several locals below are declared by the kernel generator but never used in this
+     * particular kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv: SIMD table indices read back as integers */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters.
+     * The charges of i sites 1-3 (pre-multiplied by facel) and the VdW parameter offset of
+     * i site 0 are read once, from the first i entry, and reused for every list entry;
+     * presumably all i waters in one list share the same parameters - TODO confirm.
+     * vdwioffset0 selects a row of 2*nvdwtype reals in vdwparam; the j type later adds
+     * 2*type to reach one (c6,c12) pair.
+     * NOTE(review): iinr[0] is read before nri is examined - confirm callers never pass
+     * an empty list.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Give these a defined value up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset (in reals) of the shift vector for this list; also used for fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) are handled per iteration.
+         * The bound j_index_end-1 leaves a possible odd last neighbor to the
+         * single-particle block that follows the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it.
+             * Site 0 only needs 1/r^2 (Lennard-Jones); sites 1-3 need 1/r as well (Ewald).
+             */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 0 - j: LJ only  *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * rinvsix = rinvsq^3. Assuming _fjsp_msub_v2r8(a,b,c) computes a*b-c (TODO confirm
+             * against the intrinsic reference), fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq.
+             * c6/c12 are used exactly as loaded; no extra factors are applied here.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: the same d*fscal term is accumulated for the i site
+             * and for the j particle; the j sum is handed to the decrement helper below.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 1 - j: Ewald    *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS
+             * Force-only table lookup in ewtab (fr->ic->tabq_coul_F). ewtabF/ewtabFn are fetched
+             * starting at the truncated index of each lane; presumably the table entry and its
+             * successor - confirm in gmx_fjsp_load_2pair_swizzle_v2r8.
+             * Assuming _fjsp_nmsub_v2r8(a,b,c) computes c-a*b (TODO confirm), the table value is
+             * (1-eweps)*ewtabF + eweps*ewtabFn, and felec = qq*rinv*(rinvsq - table value).
+             */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 2 - j: Ewald    *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS (same scheme as for i site 1) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 3 - j: Ewald    *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS (same scheme as for i site 1) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        if(jidx<j_index_end) /* one j particle is left over when the neighbor count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles (single particle A only) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 0 - j: LJ only  *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expression as in the unrolled loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* NOTE(review): presumably clears the unused second lane - confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 1 - j: Ewald    *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS
+             * Same scheme as in the unrolled loop, but only the table index stored in
+             * ewconv.i[0] is used for the single j particle.
+             */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 2 - j: Ewald    *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS (same scheme as for i site 1) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 3 - j: Ewald    *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS (same scheme as for i site 1) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i-site forces to the helper together
+         * with the force and shift-force locations for this list entry.
+         */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 24 per list entry plus 150 per j particle */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*150);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..faa0f8e
--- /dev/null
@@ -0,0 +1,2166 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 434 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 434 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*434);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 384 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 384 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*384);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..a26a79b
--- /dev/null
@@ -0,0 +1,551 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (tabulated: table fr->ic->tabq_coul_FDV0, table index = r*fr->ic->tabq_scale)
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle (one i atom per list entry; j atoms are processed two per SIMD pass)
+ * Calculate force/pot:        PotentialAndForce (forces go to ff and fr->fshift, energies to kernel_data->energygrp_elec)
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,       /* read: nri, iinr, jindex, jjnr, shift, gid */
+                     rvec * gmx_restrict                    xx,          /* coordinates; accessed as flat reals through xx[0] */
+                     rvec * gmx_restrict                    ff,          /* forces; updated as flat reals through ff[0] */
+                     t_forcerec * gmx_restrict              fr,          /* read: shift_vec, fshift, epsfac, ic->sh_ewald, ic->tabq_coul_FDV0, ic->tabq_scale */
+                     t_mdatoms * gmx_restrict               mdatoms,     /* read: chargeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* energygrp_elec receives the summed potential */
+                     t_nrnb * gmx_restrict                  nrnb)        /* flop accounting, passed to inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (only suffix 0 occurs in this particle-particle kernel).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     * Many locals below (e.g. rcutoff*, vdw*, c6_00/c12_00, the masks, vfconv/gbconv) are declared but never referenced here. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* only fscal is used in this kernel */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv exposes the two table indices as integers */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set here but not read again in this kernel */
+    ewtab            = fr->ic->tabq_coul_FDV0; /* indexed below with a stride of 4 reals per table point */
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* 0.5/scale, i.e. half of one table step in r */
+
+    /* Give the j indices and offsets a defined value up front, purely to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i atom and one shift index per entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list (offset into the flat shiftvec/fshift arrays) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: i charge pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A,B) per pass; an odd leftover atom is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r obtained as rsq*rinv */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is the remainder ewrt-index */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* spill the indices so they can be read back as ewconv.i[0] and ewconv.i[1] */
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* table point of atom A, starting at its first entry */
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] ); /* table point of atom B, starting at its first entry */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD); /* presumably regroups to per-lane F values in ewtabF and D values in ewtabD - confirm macro */
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2); /* entry at offset 2 of atom A's table point */
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2); /* same entry for atom B; ewtabFn is only a transpose operand here */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF); /* presumably ewtabF + eweps*ewtabD, if madd(a,b,c) is a*b+c - confirm */
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV); /* presumably ewtabV - (halfspace*eweps)*(ewtabF+felec) - confirm nmsub */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec)); /* qq*(rinv - table potential) */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec)); /* qq*rinv*(rinvsq - table force) */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force: accumulate on the i atom here; the helper call below handles the two j atoms */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 44 flops */
+        }
+
+        if(jidx<j_index_end) /* exactly one unpaired j atom is left over */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j atom; what the second lane holds depends on the helper, not shown here) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: one charge merged into a zeroed register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r obtained as rsq*rinv */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is the remainder ewrt-index */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* only ewconv.i[0] is read back in this single-atom path */
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* table point of atom A, starting at its first entry */
+            ewtabD           = _fjsp_setzero_v2r8(); /* no atom B: zeros stand in for its table point */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2); /* entry at offset 2 of atom A's table point */
+            ewtabFn          = _fjsp_setzero_v2r8(); /* zero transpose operand in place of atom B's entry */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF); /* presumably ewtabF + eweps*ewtabD, if madd(a,b,c) is a*b+c - confirm */
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV); /* presumably ewtabV - (halfspace*eweps)*(ewtabF+felec) - confirm nmsub */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec)); /* qq*(rinv - table potential) */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec)); /* qq*rinv*(rinvsq - table force) */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably zeroes the unused lane before summing - confirm unpacklo */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused lane so it adds no force - confirm unpacklo */
+
+            /* Update vectorial force: accumulate on the i atom here; the helper call below handles the single j atom */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 44 flops */
+        }
+
+        /* End of innermost loop: pass the accumulated i force to the helper with the i-atom force and shift-force destinations */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum is handed over for energygrp_elec[ggid] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counted per j atom, not per SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 8 per outer iteration plus 44 per j atom, booked under eNR_NBKERNEL_ELEC_VF */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*44);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 39 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 39 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*39);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9716ad0
--- /dev/null
@@ -0,0 +1,971 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (tabulated; the table is taken from fr->ic->tabq_coul_FDV0)
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle (three i atoms per list entry against single j atoms)
+ * Calculate force/pot:        PotentialAndForce (energy sums are handed to kernel_data->energygrp_elec)
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2 refer to the three atoms of the i water (outer loop); the single j particle
+     * always uses suffix 0, so e.g. dx10 is the displacement between i atom 1 and j atom 0.
+     * Suffixes A,B refer to the 2-way j loop unrolling done with double precision SIMD, i.e. the two
+     * different jnr indices whose data are put in the two positions of a _fjsp_v2r8 register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters: the three i-atom charges (times facel) are read once, from the first i entry, and reused for every list entry */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Dummy initial values, only there to avoid compiler warnings (presumably about possibly-uninitialized variables) */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i water (nri entries) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of the shift vector for this list; used to index both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this i entry uses jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index (offset of the i water in the flat x and f arrays) */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j neighbors are processed in pairs (A,B); a leftover odd neighbor is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates (for both j atoms A and B) */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles (charges of j atoms A and B) */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i atom 0 and the j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS (table data are read at ewtab + 4*index and ewtab + 4*index + 2) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps keeps the remainder ewrt - index */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force: the same d*fscal term goes into the i atom 0 sums and into the j sums fjx0/fjy0/fjz0 */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i atom 1 and the j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS (table data are read at ewtab + 4*index and ewtab + 4*index + 2) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps keeps the remainder ewrt - index */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force: the same d*fscal term goes into the i atom 1 sums and into the j sums fjx0/fjy0/fjz0 */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i atom 2 and the j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS (table data are read at ewtab + 4*index and ewtab + 4*index + 2) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps keeps the remainder ewrt - index */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force: the same d*fscal term goes into the i atom 2 sums and into the j sums fjx0/fjy0/fjz0 */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 135 flops (the per-j-atom figure used in the inneriter*135 accounting at the end) */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Odd number of j neighbors: load coordinates of the single remaining j atom (only jnrA is used here) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles (single j charge, combined with a zero register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i atom 0 and the j atom */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS (table data are read at ewtab + 4*index and ewtab + 4*index + 2; only ewconv.i[0] is used) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps keeps the remainder ewrt - index */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom (velec is first combined with a zero register, presumably to clear the unused lane). */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force: the same d*fscal term goes into the i atom 0 sums and into the j sums fjx0/fjy0/fjz0 */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i atom 1 and the j atom */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS (table data are read at ewtab + 4*index and ewtab + 4*index + 2; only ewconv.i[0] is used) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps keeps the remainder ewrt - index */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom (velec is first combined with a zero register, presumably to clear the unused lane). */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force: the same d*fscal term goes into the i atom 1 sums and into the j sums fjx0/fjy0/fjz0 */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i atom 2 and the j atom */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS (table data are read at ewtab + 4*index and ewtab + 4*index + 2; only ewconv.i[0] is used) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps keeps the remainder ewrt - index */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom (velec is first combined with a zero register, presumably to clear the unused lane). */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force: the same d*fscal term goes into the i atom 2 sums and into the j sums fjx0/fjy0/fjz0 */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 135 flops (the per-j-atom figure used in the inneriter*135 accounting at the end) */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum is handed over for energy group index ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (one per j atom of this i entry) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 19 per outer iteration plus 135 per inner iteration */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*135);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (tabulated; reads fr->ic->tabq_coul_F)
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle (three i sites per list entry against single j entries)
+ * Calculate force/pot:        Force (no potential sums; kernel_data is not referenced)
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar; /* declared by the generator, not referenced in this kernel */
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* only fscal is referenced below */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2; /* only felec and facel are referenced below */
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask; /* not referenced in this kernel */
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* initialized but not read below */
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* initialized but not read below */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: SIMD table indices read back as two integers */
+
+    x                = xx[0]; /* atoms are addressed below as x + DIM*index */
+    f                = ff[0]; /* same addressing as x */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* set but not read below */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set but not read in this force-only kernel */
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* set but not read in this force-only kernel */
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0]; /* NOTE(review): read before the loop, so also when nri == 0 - confirm callers never pass an empty list */
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0])); /* iq0..iq2 are computed once from the first list entry and reused for every i entry */
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Give the j indices defined values up front to silence maybe-uninitialized compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2) /* two j entries per pass; a leftover single entry is handled after the loop */
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* indices become readable as ewconv.i[0] and ewconv.i[1] */
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* presumably (1-eweps)*ewtabF + eweps*ewtabFn - TODO confirm nmsub operand order */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* fjx0/fjy0/fjz0 hold the sum over all three i sites */
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end) /* true only when one j entry is left over after the paired loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably fills one lane and leaves the other zero - TODO confirm */
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn); /* only index 0 is used for the single j entry */
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably clears the unused lane so it adds no force - TODO confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start; /* counts j entries, not passes of the paired loop */
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*120); /* 18 per i entry plus 120 per j entry, matching the flop comments above */
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..91814d3
--- /dev/null
@@ -0,0 +1,1989 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 396 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 396 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*351);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c787566
--- /dev/null
@@ -0,0 +1,971 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 135 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 135 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*135);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (plain Coulomb minus a term interpolated from the table fr->ic->tabq_coul_F)
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle (i atoms 1-3 of each i entry against single j particles)
+ * Calculate force/pot:        Force (no potential energy is accumulated)
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters. Many locals (e.g. c6_*, c12_*, krf, velecsum, vfconv) are declared but unused here.
+     * Suffixes A,B refer to the 2-way j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data occupy the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    /* Ewald table setup; sh_ewald and ewtabhalfspace are assigned but not read again in this force-only kernel */
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters: charges of i atoms 1-3 times epsfac, read once from the first list entry and reused for all entries */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Pre-initialize the j indices and offsets to avoid compiler warnings about possibly uninitialized variables */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (the +DIM offset skips i atom 0) */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        /* Clear the i-atom force accumulators for this list entry */
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j particles are processed two at a time (jnrA and jnrB) */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+            /* NOTE(review): assuming madd(a,b,c)=a*b+c and nmsub(a,b,c)=c-a*b, felec below is (1-eweps)*ewtabF + eweps*ewtabFn; same in the blocks that follow */
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+            /* Hand the summed j contributions for both j particles to the decrement helper */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+        /* Epilogue: the paired loop above stops one short when the neighbor count is odd; handle that single j particle */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+            /* Only table index ewconv.i[0] is used in this epilogue, since there is a single j particle */
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+            /* NOTE(review): unpacklo with a zero vector presumably keeps the first element and zeroes the second, which has no real j particle */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+        /* Pass the nine i-force accumulators to the update helper, targeting f at i atom 1 (+DIM) and this list's fshift entry */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: report 18 per outer and 120 per inner iteration to inc_nrnb */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*120);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..24cbc9a
--- /dev/null
@@ -0,0 +1,1989 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 396 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 396 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*396);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*351);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..93a7338
--- /dev/null
@@ -0,0 +1,820 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn (tabulated GB term from fr->gbtab plus Coulomb qq*rinv)
+ * VdW interaction:            CubicSplineTable (dispersion and repulsion both read from kernel_data->table_vdw)
+ * Geometry:                   Particle-Particle (one i atom per list entry, j neighbors handled two per SIMD step)
+ * Calculate force/pot:        PotentialAndForce (accumulates elec, GB polarization and VdW energies, forces and dvda)
+ */
+void
+nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list: nri, iinr, jindex, jjnr, shift, gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates; accessed as a flat real array via xx[0] */
+                     rvec * gmx_restrict                    ff,           /* forces; accessed as a flat real array via ff[0] */
+                     t_forcerec * gmx_restrict              fr,           /* shift_vec, fshift, epsfac, ntype, nbfp, invsqrta, dvda, gbtab, epsilon_r, gb_epsilon_solvent are read */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* chargeA and typeA are read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* table_vdw plus the energygrp_elec/polarization/vdw accumulators */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, passed to inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two
+     * different jnr indices whose data is placed in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* used to read the two table indices back as integers; ewconv is not referenced in this kernel */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Give these a defined value so the compiler does not warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vgbsum           = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* stride of 8 reals per table point: see the loads at +0,+2 (dispersion) and +4,+6 (repulsion) below */
+            vfconv.i[1]     *= 8;
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] ); /* GB table is indexed with a stride of 4 reals per point */
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 95 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of neighbors: exactly one j atom (A) is left after the pairwise loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for the single j particle; presumably only the lower element is filled, on top of a zero vector - confirm against _fjsp_loadl_v2r8 */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* same stride of 8 reals per table point as in the pairwise loop; only index [0] is used for loads here */
+            vfconv.i[1]     *= 8;
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_setzero_v2r8(); /* no second j atom: zeros stand in for the B table entry */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp); /* NOTE(review): dvdatmp is not passed through unpacklo here, unlike velec/vgb/vvdw/fscal below - confirm its upper element is always zero */
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sums for this i atom; each term first goes through unpacklo with a zero vector, presumably to clear the unused upper element - confirm */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 95 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies in the energy-group slot selected by ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0)); /* scale the i-atom dvda sum by isai0^2 before it is handed to dvda+inr */
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations (counted per j atom, not per SIMD step) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 10 per outer iteration plus 95 per j atom */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*95);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 85 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 85 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*85);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..0a58534
--- /dev/null
@@ -0,0 +1,706 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For each i particle of the neighbor list the kernel walks its j neighbors two
+ * at a time (one neighbor per lane of a _fjsp_v2r8 register) and finishes with a
+ * single-neighbor pass when the neighbor count is odd. Per pair it evaluates a
+ * plain Coulomb term, a tabulated generalized Born term and a Lennard-Jones term,
+ * and accumulates both potentials and forces.
+ *
+ * Parameters:
+ *   nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+ *   xx           particle coordinates; read only through x = xx[0].
+ *   ff           particle forces; f = ff[0] is handed to the force decrement and
+ *                i-force update helpers.
+ *   fr           force record; supplies shift_vec, fshift, epsfac, ntype, nbfp,
+ *                invsqrta, dvda, gbtab, epsilon_r and gb_epsilon_solvent.
+ *   mdatoms      supplies chargeA and typeA.
+ *   kernel_data  energygrp_elec, energygrp_polarization and energygrp_vdw receive
+ *                the per-i potential sums at the slot given by gid[iidx].
+ *   nrnb         flop accounting, updated once at the end through inc_nrnb().
+ *
+ * NOTE(review): the formulas quoted in the comments below read
+ * _fjsp_madd_v2r8(a,b,c) as a*b+c and _fjsp_msub_v2r8(a,b,c) as a*b-c; the
+ * intrinsics are not defined in this file, so confirm against the Fujitsu
+ * compiler documentation.
+ */
+void
+nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Several locals declared below (for example tx/ty/tz, rcutoff, krf, vftab, dummy_mask,
+     * vfconv and ewconv) are never referenced in this kernel; they are presumably boilerplate
+     * that the generator emits for every kernel flavor.
+     *
+     * After the declarations, the neighbor list arrays, the force record fields and the
+     * per-atom parameter arrays are cached in locals, and the scalar constants facel,
+     * gbtabscale and gbinvepsdiff = 1/epsilon_r - 1/gb_epsilon_solvent are broadcast with
+     * gmx_fjsp_set1_v2r8().
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Give the j indices and coordinate offsets a defined value before the loops.
+     * They are always reassigned before use; this only avoids compiler warnings.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i particle entry (nri in total) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list (offset of DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: entries [j_index_start, j_index_end) of jjnr */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector; going by the helper name the result is
+         * broadcast to both SIMD lanes (helper definition is not visible here).
+         */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge premultiplied by facel, the invsqrta entry,
+         * and the row offset into the c6/c12 parameter table (2 reals per type pair).
+         */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums (and the dvda accumulator) for this i particle */
+        velecsum         = _fjsp_setzero_v2r8();
+        vgbsum           = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: consumes the j neighbors in pairs (jidx and jidx+1).
+         * A leftover single neighbor is handled by the if-block that follows the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index, for both neighbors A and B */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates of A and B into jx0/jy0/jz0 */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) per lane */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it: rinv00 comes from the invsqrt
+             * helper (presumably 1/sqrt(rsq00)) and rinvsq00 is its square.
+             */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: charge, invsqrta entry and the column offset
+             * (2 reals per type) into the c6/c12 parameter table.
+             */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms: the charge product
+             * (iq0 already carries facel) and the c6/c12 pair for each lane.
+             */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS
+             *   isaprod    = isai0*isaj0
+             *   gbqqfactor = -qq00*isaprod*gbinvepsdiff
+             *   gbscale    = isaprod*gbtabscale
+             */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             *
+             * gbeps is rt minus the converted-back integer value, i.e. the fractional position
+             * inside the table interval (assuming dtox/xtod convert double <-> integer lanes).
+             * The integer lanes are read back as gbconv.i[0] and gbconv.i[1] through the union.
+             * Each table point is addressed with a stride of 4 reals; the two loads per lane
+             * (offset 0 and +2) are transposed so that Y, F, G and H each hold one of the four
+             * values for both lanes. With those:
+             *   Fp      = F + gbeps*(G + gbeps*H)
+             *   VV      = Y + gbeps*Fp               vgb = gbqqfactor*VV
+             *   FF      = Fp + gbeps*(G + 2*gbeps*H) fgb = gbqqfactor*FF*gbscale
+             *   dvdatmp = -0.5*(vgb + fgb*r00)
+             * dvdatmp is summed into dvdasum for the i particle and, multiplied by isaj0^2,
+             * passed to the increment helper for dvda[jnrA] and dvda[jnrB].
+             * Finally velec = qq00*rinv00 and felec = (velec*rinv00 - fgb)*rinv00.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             *   rinvsix = rinvsq00^3
+             *   vvdw6   = c6*rinvsix,  vvdw12 = c12*rinvsix^2
+             *   vvdw    = vvdw12/12 - vvdw6/6
+             *   fvdw    = (vvdw12 - vvdw6)*rinvsq00
+             * NOTE(review): the 1/12 and 1/6 factors suggest that c6 and c12 are stored
+             * premultiplied by 6 and 12 in fr->nbfp - confirm against the code that fills it.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force: add fscal times the displacement to the i force
+             * accumulators, and pass the same factors to the decrement helper for the forces
+             * of j atoms A and B.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 74 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Remainder pass for an odd neighbor count: only neighbor A exists, so the
+             * single-pointer helper variants are used from here on.
+             * load j atom coordinates
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles. The loadl calls start from a zeroed register,
+             * so presumably only one lane receives the value for neighbor A and the other
+             * stays zero - confirm against the _fjsp_loadl_v2r8 documentation.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS (same formulas as in the paired loop) */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             *
+             * Only gbconv.i[0] is used for table loads here; the registers that held the
+             * neighbor B data in the paired loop (F and H before the transposes) are set to
+             * zero instead of being loaded.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same formulas as in the paired loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * Each term is first combined with a zero register through unpacklo, presumably
+             * so that the lane without a real neighbor contributes nothing to the sums.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force for the i particle and for the single j atom A */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 74 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the update helper together
+         * with the force and shift-force locations of this i particle.
+         */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: each sum goes to slot ggid of its energy group array.
+         * dvdasum is multiplied by isai0^2 and passed with the dvda entry of particle inr.
+         */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations by the neighbor count of this i particle */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 10 per outer iteration plus 74 per inner iteration */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*74);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 67 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 67 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*67);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..bc2317e
--- /dev/null
@@ -0,0 +1,635 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+    /* View the rvec coordinate (xx) and force (ff) arrays as flat real arrays x and f */
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    /* Generalized Born inputs from the force record (invsqrta, dvda, and the GB table) */
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Zero the j indices/offsets up front to avoid compiler warnings (presumably "may be used uninitialized") */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vgbsum           = _fjsp_setzero_v2r8();
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) are processed per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+            /* The table is indexed with a stride of 4 reals per point; the transposes presumably regroup them per lane into Y,F,G,H */
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+            /* Assuming madd(a,b,c) computes a*b+c, VV is a cubic in gbeps and FF is its derivative dVV/d(gbeps) */
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 61 flops */
+        }
+        /* Leftover single j atom when this list holds an odd number of neighbors: only jnrA is loaded */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* Update potential sums for this i atom; the unpacklo with zero presumably clears the unused second lane first. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 61 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies (energy group ggid) and the dvda entry of the i atom */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 9 per outer iteration plus 61 per j atom */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*61);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*59);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9a0b84a
--- /dev/null
@@ -0,0 +1,632 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,           /* particle coordinates (read only here) */
+                     rvec * gmx_restrict                    ff,           /* forces; handed to the j-force decrement and i-force update helpers */
+                     t_forcerec * gmx_restrict              fr,           /* shift_vec, ntype and nbfp are read; fshift goes to the i-force update helper */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* typeA is read as the per-particle VdW type */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* table_vdw is read; energygrp_vdw receives the VdW potential sums */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, updated through inc_nrnb() */
+{
+    /* Evaluates the tabulated (cubic spline) Van der Waals potential and force for every i/j pair in nlist.
+     * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or just 0 for
+     * non-waters. Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two
+     * different jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only vfconv is used in this kernel: table indices are stored via .simd and read back via .i[] */
+
+    x                = xx[0]; /* address coordinates and forces as flat real arrays, indexed with DIM*particle below */
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Give these a defined value up front so compilers do not warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: offset of the i type's row in vdwparam (two values per type pair) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) are processed per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Turn the indices into table offsets; offsets +0..+3 are read for dispersion and +4..+7 for repulsion */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 59 flops */
+        }
+        /* Odd number of neighbors: one j particle is left over and is processed using only the A slot */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Same offset scaling as in the unrolled loop; only vfconv.i[0] is used for table loads below */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION (A slot only; zeros take the place of the B table loads) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (A slot only; zeros take the place of the B table loads) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom. NOTE(review): unpacklo presumably clears the unused B lane first - confirm */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+            /* NOTE(review): presumably clears the unused B lane so that it adds no force - confirm helper semantics */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Counted like one inner-loop pair interaction: 59 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: the sum goes to the energy-group slot selected by this list's gid entry */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counts j particles, including a leftover one) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*59);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,           /* particle coordinates (read only here) */
+                     rvec * gmx_restrict                    ff,           /* forces; handed to the j-force decrement and i-force update helpers */
+                     t_forcerec * gmx_restrict              fr,           /* shift_vec, ntype and nbfp are read; fshift goes to the i-force update helper */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* typeA is read as the per-particle VdW type */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* only table_vdw is read in this force-only kernel */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, updated through inc_nrnb() */
+{
+    /* Evaluates the tabulated (cubic spline) Van der Waals force, without the potential, for every i/j pair in nlist.
+     * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or just 0 for
+     * non-waters. Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two
+     * different jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only vfconv is used in this kernel: table indices are stored via .simd and read back via .i[] */
+
+    x                = xx[0]; /* address coordinates and forces as flat real arrays, indexed with DIM*particle below */
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* assigned but not read again in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Give these a defined value up front so compilers do not warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: offset of the i type's row in vdwparam (two values per type pair) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j particles (A and B) are processed per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Turn the indices into table offsets; offsets +0..+3 are read for dispersion and +4..+7 for repulsion */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION (force only: Y is loaded for the transpose, but no VV is formed) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (force only) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 51 flops */
+        }
+        /* Odd number of neighbors: one j particle is left over and is processed using only the A slot */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Same offset scaling as in the unrolled loop; only vfconv.i[0] is used for table loads below */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION (A slot only; zeros take the place of the B table loads) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (A slot only; zeros take the place of the B table loads) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+            /* NOTE(review): presumably clears the unused B lane so that it adds no force - confirm helper semantics */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Counted like one inner-loop pair interaction: 51 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counts j particles, including a leftover one) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*51);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7bd48ad
--- /dev/null
@@ -0,0 +1,552 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,       /* i entries (iinr), j ranges (jindex/jjnr), shift and gid indices */
+                     rvec * gmx_restrict                    xx,          /* coordinates; accessed below as a flat real array */
+                     rvec * gmx_restrict                    ff,          /* forces; accessed below as a flat real array */
+                     t_forcerec * gmx_restrict              fr,          /* supplies shift_vec, fshift, ntype, nbfp, rvdw and ic->sh_invrc6 */
+                     t_mdatoms * gmx_restrict               mdatoms,     /* only typeA is read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* only energygrp_vdw is used */
+                     t_nrnb * gmx_restrict                  nrnb)        /* flop accounting, passed to inc_nrnb */
+{
+    /* Shifted Lennard-Jones kernel without electrostatics: energy and force for each i entry of nlist
+     * against its j neighbours. Numeric suffixes are particle indices inside an entry (always 0 in this
+     * particle-particle kernel); suffixes A,B are the two j atoms processed together per unrolled SIMD
+     * iteration. Many locals declared below are generator boilerplate that this kernel never references.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* not referenced in this kernel */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff); /* squared cut-off, compared against rsq00 below */
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6); /* enters only the energy expression below */
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* not read again in this kernel */
+
+    /* Pre-initialise the j indices and offsets so compilers do not warn about uninitialised use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Outer loop: one iteration per i entry of the neighbor list */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this entry's shift vector, used for both shiftvec and fshift (DIM reals each) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Range [j_index_start, j_index_end) of this entry's neighbours inside jjnr */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* i atom number and its offset into the flat coordinate/force arrays */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Offset of the i atom type's row in vdwparam (2 reals per type pair, nvdwtype pairs per row) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset the VdW energy accumulator for this i entry */
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Main inner loop: two j atoms per iteration; an odd leftover atom is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Column offsets of the two j atom types within the i atom's vdwparam row */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* judging by the helper's name: skip unless some lane is inside the cut-off */
+            {
+
+            /* Fetch c6 and c12 for both i-j pairs from the parameter table */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth)); /* presumably (vvdw12-c12*sh^2)/12 - (vvdw6-c6*sh)/6, assuming nmsub(a,b,c)=c-a*b and msub(a,b,c)=a*b-c */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Mask the energy with the cut-off comparison result before adding it to this i entry's sum */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* same masking for the scalar force */
+
+            /* Accumulate fscal*d on the i atom; the helper below receives the j force pointers */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 44 flops */
+        }
+
+        if(jidx<j_index_end) /* one j atom is left over when the neighbour count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Column offset of the single j atom type within the i atom's vdwparam row */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* same cut-off pre-check as in the main loop */
+            {
+
+            /* Fetch c6 and c12 for the single i-j pair from the parameter table */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth)); /* same expression as in the main loop */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Mask the energy with the cut-off comparison result before adding it to this i entry's sum */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8()); /* presumably keeps the first lane and zeroes the one with no j atom - confirm */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same lane clearing for the scalar force */
+
+            /* Accumulate fscal*d on the i atom; the helper below receives the j force pointer */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 44 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Hand this entry's energy sum to the helper together with the energygrp_vdw slot selected by ggid */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Count the j atoms handled for this entry (atoms, not SIMD iterations) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Report the flop estimate: 7 per i entry plus 44 per j atom */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*44);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,       /* i entries (iinr), j ranges (jindex/jjnr), shift and gid indices */
+                     rvec * gmx_restrict                    xx,          /* coordinates; accessed below as a flat real array */
+                     rvec * gmx_restrict                    ff,          /* forces; accessed below as a flat real array */
+                     t_forcerec * gmx_restrict              fr,          /* supplies shift_vec, fshift, ntype, nbfp, rvdw and ic->sh_invrc6 */
+                     t_mdatoms * gmx_restrict               mdatoms,     /* only typeA is read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* not referenced in this force-only kernel */
+                     t_nrnb * gmx_restrict                  nrnb)        /* flop accounting, passed to inc_nrnb */
+{
+    /* Force-only Lennard-Jones kernel without electrostatics: no energy is accumulated, only forces for
+     * each i entry of nlist against its j neighbours. Numeric suffixes are particle indices inside an
+     * entry (always 0 in this particle-particle kernel); suffixes A,B are the two j atoms processed per
+     * unrolled SIMD iteration. Many locals declared below are generator boilerplate and never referenced.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0); /* not read in this kernel */
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0); /* not read in this kernel */
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* not referenced in this kernel */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* assigned but not read again in this kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff); /* squared cut-off, compared against rsq00 below */
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6); /* set but not read again in this force-only kernel */
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* not read again in this kernel */
+
+    /* Pre-initialise the j indices and offsets so compilers do not warn about uninitialised use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Outer loop: one iteration per i entry of the neighbor list */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this entry's shift vector, used for both shiftvec and fshift (DIM reals each) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Range [j_index_start, j_index_end) of this entry's neighbours inside jjnr */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* i atom number and its offset into the flat coordinate/force arrays */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Offset of the i atom type's row in vdwparam (2 reals per type pair, nvdwtype pairs per row) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Main inner loop: two j atoms per iteration; an odd leftover atom is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Column offsets of the two j atom types within the i atom's vdwparam row */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* judging by the helper's name: skip unless some lane is inside the cut-off */
+            {
+
+            /* Fetch c6 and c12 for both i-j pairs from the parameter table */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00)); /* presumably (c12*rinvsix - c6)*rinvsix*rinvsq00, assuming msub(a,b,c)=a*b-c */
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* mask the scalar force with the cut-off comparison result */
+
+            /* Accumulate fscal*d on the i atom; the helper below receives the j force pointers */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 33 flops */
+        }
+
+        if(jidx<j_index_end) /* one j atom is left over when the neighbour count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Column offset of the single j atom type within the i atom's vdwparam row */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* same cut-off pre-check as in the main loop */
+            {
+
+            /* Fetch c6 and c12 for the single i-j pair from the parameter table */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00)); /* same expression as in the main loop */
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the first lane and zeroes the one with no j atom - confirm */
+
+            /* Accumulate fscal*d on the i atom; the helper below receives the j force pointer */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 33 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Count the j atoms handled for this entry (atoms, not SIMD iterations) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Report the flop estimate: 6 per i entry plus 33 per j atom */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*33);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..efe5597
--- /dev/null
@@ -0,0 +1,636 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*62);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel. For each i entry of the neighbor list it walks the j
+ * neighbors two at a time (plus a one-atom tail), evaluates the Lennard-Jones
+ * force scaled by a polynomial switch in d = max(r - fr->rvdw_switch, 0),
+ * zeroes lanes with rsq >= fr->rvdw^2 via cutoff_mask, and passes the per-pair
+ * scalar force to the gmx_fjsp_* helpers together with the i-, j- and
+ * shift-force addresses. No potential energy is accumulated.
+ *
+ * Parameters as used by this body:
+ *   nlist        - supplies nri, iinr, jindex, jjnr and shift (gid is fetched but not used here)
+ *   xx           - coordinates, read as a flat real array through xx[0]
+ *   ff           - forces, handed to the update helpers as a flat real array through ff[0]
+ *   fr           - supplies shift_vec, fshift, ntype, nbfp, rvdw and rvdw_switch
+ *   mdatoms      - supplies typeA, indexed per atom to select the VdW parameters
+ *   kernel_data  - not referenced in this force-only kernel
+ *   nrnb         - flop accounting, updated once through inc_nrnb() at the end
+ */
+void
+nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA, jnrB) whose data is put in the two positions of the SIMD register.
+     * NOTE(review): several of the names declared below (e.g. tx, ty, tz, jidxall, qq00,
+     * itab_tmp, dummy_mask, two, vfconv/gbconv/ewconv) are never referenced in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters. With D = rcutoff - rswitch (d_scalar), the potential
+     * coefficients swV3..swV5 are -10/D^3, 15/D^4, -6/D^5 and the derivative
+     * coefficients swF2..swF4 are -30/D^3, 60/D^4, -30/D^5.
+     * NOTE(review): the SIMD variable d set here is reassigned inside the j loops
+     * before it is ever read, so this particular assignment has no effect.
+     */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Pre-initialize the j indices and offsets so that compilers do not warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i entry, nri in total */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector offset for this list (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this i entry owns jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: offset of this i type's row in vdwparam
+         * (stride 2 per j type, matching the c6/c12 pair loaded in the inner loop)
+         */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j atoms (A and B) per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * The whole block is skipped when the any_lt test of rsq00 against
+             * rcutoff2 fails; per-lane masking is done further down with cutoff_mask.
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * vvdw6 = c6/r^6 and vvdw12 = c12/r^12; vvdw combines them using the 1/12 and
+             * 1/6 factors and is kept only because the switched force needs the potential.
+             * The unswitched scalar force is fvdw = (vvdw12 - vvdw6)/r^2.
+             * Afterwards d = max(r - rswitch, 0) and the polynomials sw and dsw are built
+             * from swV3..swV5 and swF2..swF4 (assuming _fjsp_madd_v2r8(a,b,c) is a*b+c).
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function: sw scales the potential, dsw is the derivative of sw with respect to d */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate dx*fscal on the i atom, then pass the same
+             * fscal and displacement to the helper that updates both j atoms
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 59 flops (the per-j-atom figure passed to inc_nrnb below);
+             * a leftover single j atom is handled after this loop
+             */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j atom: the 1ptr/1pair helper variants are used in this tail) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * Same sequence as in the unrolled loop above, for one j atom.
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (see the unrolled loop for the formulas) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force. The unpacklo with a zero register just above presumably
+             * clears the second lane of fscal, which has no j atom in this tail - TODO confirm
+             * against the intrinsic's definition.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 59 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together with
+         * the i-atom force and shift-force addresses
+         */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j atom, not per unrolled loop trip) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 6 per i entry plus 59 per j atom, booked under eNR_NBKERNEL_VDW_F */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*59);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..8a22af6
--- /dev/null
@@ -0,0 +1,498 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * Potential-and-force kernel. For each i entry of the neighbor list it walks
+ * the j neighbors two at a time (plus a one-atom tail) and evaluates plain
+ * Lennard-Jones for every listed pair; there is no cutoff test in this kernel.
+ * The per-pair scalar force is passed to the gmx_fjsp_* helpers together with
+ * the i-, j- and shift-force addresses, and the per-i energy sum is passed to
+ * gmx_fjsp_update_1pot_v2r8 with the address kernel_data->energygrp_vdw+gid[iidx].
+ *
+ * Parameters as used by this body:
+ *   nlist        - supplies nri, iinr, jindex, jjnr, shift and gid
+ *   xx           - coordinates, read as a flat real array through xx[0]
+ *   ff           - forces, handed to the update helpers as a flat real array through ff[0]
+ *   fr           - supplies shift_vec, fshift, ntype and nbfp
+ *   mdatoms      - supplies typeA, indexed per atom to select the VdW parameters
+ *   kernel_data  - supplies energygrp_vdw, the destination of the energy sums
+ *   nrnb         - flop accounting, updated once through inc_nrnb() at the end
+ */
+void
+nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA, jnrB) whose data is put in the two positions of the SIMD register.
+     * NOTE(review): several of the names declared below (e.g. rcutoff_scalar, rcutoff, rcutoff2,
+     * cutoff_mask, rinv00, r00, one, two, vfconv/gbconv/ewconv) are never referenced in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Pre-initialize the j indices and offsets so that compilers do not warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i entry, nri in total */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector offset for this list (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this i entry owns jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: offset of this i type's row in vdwparam
+         * (stride 2 per j type, matching the c6/c12 pair loaded in the inner loop)
+         */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums (one energy accumulator per i entry) */
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it (only 1/r^2 is needed here) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * vvdw6 = c6/r^6 and vvdw12 = c12/r^12; the potential vvdw combines them using
+             * the 1/12 and 1/6 factors, and the scalar force is fvdw = (vvdw12 - vvdw6)/r^2.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: accumulate dx*fscal on the i atom, then pass the same
+             * fscal and displacement to the helper that updates both j atoms
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 35 flops (the per-j-atom figure passed to inc_nrnb below);
+             * a leftover single j atom is handled after this loop
+             */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j atom: the 1ptr/1pair helper variants are used in this tail) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same formulas as in the unrolled loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * The unpacklo with a zero register presumably clears the second lane, which has
+             * no j atom in this tail - TODO confirm against the intrinsic's definition.
+             */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force (fscal went through the same unpacklo-with-zero step just above) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 35 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together with
+         * the i-atom force and shift-force addresses
+         */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: the sum goes to the energygrp_vdw slot selected by gid[iidx] */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j atom, not per unrolled loop trip) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 7 per i entry plus 35 per j atom, booked under eNR_NBKERNEL_VDW_VF */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*35);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Force-only Lennard-Jones kernel without electrostatics: loops over the i entries of nlist and their
+     * j neighbors, passing ff and fr->fshift to the gmx_fjsp force-update helpers.
+     * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or just 0 for non-waters.
+     * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two different jnr
+     * indices whose data is put in the two positions of the SIMD register. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Give the j indices/offsets defined values up front; this is only here to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: j neighbors are taken two at a time (jnrA and jnrB) */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION:
+             * fvdw = (c12_00*rinvsix - c6_00)*rinvsix*rinvsq00, assuming _fjsp_msub_v2r8(a,b,c) computes a*b-c */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Leftover single j neighbor (odd count): same steps as above, using only jnrA and the 1ptr/1pair helpers */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expression as in the paired loop above) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+            /* Combine fscal with zeros via unpacklo; presumably this clears the upper position, which has no j particle here */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments (6 outer, 30 inner) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*30);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d1895e9
--- /dev/null
@@ -0,0 +1,733 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Reaction-field electrostatics plus cubic-spline-table VdW kernel computing both potentials and forces:
+     * loops over the i entries of nlist and their j neighbors, masking interactions with a cutoff on rsq00.
+     * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or just 0 for non-waters.
+     * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two different jnr
+     * indices whose data is put in the two positions of the SIMD register. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Give the j indices/offsets defined values up front; this is only here to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j neighbors are taken two at a time (jnrA and jnrB) */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+            /* Only reached when gmx_fjsp_any_lt_v2r8 reports rsq00 < rcutoff2 (presumably for at least one position) */
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Scale both indices by 8: the table loads below use offsets +0,+2,+4,+6 from each scaled index */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION (table offsets +0 and +2, scaled by c6_00) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (table offsets +4 and +6, scaled by c12_00) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 75 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Leftover single j neighbor (odd count): same steps as above, using only jnrA and the 1ptr/1pair helpers */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION (zeros replace the vfconv.i[1] table loads used in the paired loop) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (zeros replace the vfconv.i[1] table loads used in the paired loop) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+            /* Combine fscal with zeros via unpacklo; presumably this clears the upper position, which has no j particle here */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 75 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: the sums go to the energygrp_elec/energygrp_vdw entries selected by ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments (9 outer, 75 inner) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*75);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 60 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 60 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*60);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..2aa77da
--- /dev/null
@@ -0,0 +1,1115 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 156 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 156 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*156);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (pairs are masked with rsq < rcutoff2, rcutoff taken from fr->rcoulomb)
+ * VdW interaction:            CubicSplineTable (table from kernel_data->table_vdw; evaluated for i atom 0 only)
+ * Geometry:                   Water3-Particle (three i atoms per list entry against single j particles)
+ * Calculate force/pot:        Force (only forces are computed; no energy terms are accumulated in this kernel)
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* read: nri, iinr, jindex, jjnr, shift, gid */
+                     rvec * gmx_restrict                    xx,           /* coordinates; accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff,           /* forces; accessed as a flat real array through ff[0] and handed to the force update helpers */
+                     t_forcerec * gmx_restrict              fr,           /* read: shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp, rcoulomb */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* read: chargeA, typeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* read: table_vdw->data, table_vdw->scale */
+                     t_nrnb * gmx_restrict                  nrnb)         /* passed to inc_nrnb() with the flop estimate at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1; /* declared but not used in this kernel */
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2; /* declared but not used in this kernel */
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv reads the two table indices back as integers; gbconv and ewconv are not used here */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* not used further in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf); /* not used further here; the force expression uses krf2 */
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* not used further in this force-only kernel */
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters: charges (scaled by facel) and the VdW offset are read once, from the first i entry, and reused for all list entries */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset into vdwparam for i atom 0; presumably two reals (c6,c12) per type pair */
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff); /* squared cutoff, compared against rsq below */
+
+    /* Pre-initialize to avoid compiler warnings; each value is assigned again before it is used */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list entry into the shift vector and shift force arrays */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8(); /* i-atom force accumulators are reset for every list entry */
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; a single leftover entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j force accumulators, shared by the three pair blocks below */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 0 - j: reaction-field + tabulated VdW)
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp)); /* fractional position within the table interval */
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* 8 reals per table point: offsets 0..3 are read for dispersion, 4..7 for repulsion */
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS: felec = qq*(rinv*rinvsq - krf2), assuming msub(a,b,c) computes a*b-c */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION: one load per j particle, then a transpose (presumably yielding per-lane Y,F and G,H) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F); /* F + eps*(G + eps*H), assuming madd(a,b,c) computes a*b+c */
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp); /* Fp + eps*(G + 2*eps*H), same assumption */
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: same evaluation using table offsets 4 and 6 */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2); /* per-lane comparison rsq < rcutoff2 */
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* apply the cutoff mask to the scalar force */
+
+            /* Update vectorial force: accumulate d*fscal for the i atom and for the j particle */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 1 - j: reaction-field only)
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 2 - j: reaction-field only)
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* hand the accumulated j forces to the helper for both j particles */
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end) /* true only when one j entry is left over after the pairwise loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* a single j charge is loaded; the other operand is a zero vector */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 0 - j: reaction-field + tabulated VdW)
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8; /* scaled like i[0], but only i[0] is used for table loads in this single-j path */
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION: only the table row of j particle A is loaded; zeros stand in for a second particle */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: same evaluation using table offsets 4 and 6 */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the lane that has no j particle - TODO confirm intrinsic semantics */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 1 - j: reaction-field only)
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 2 - j: reaction-field only)
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* single-pointer variant: only j particle A is updated */
+
+            /* Inner loop uses 129 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counts every j entry of this list, including a leftover one) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 18 per outer iteration plus 129 per j entry, matching the counts noted above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*129);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..58ab23f
--- /dev/null
@@ -0,0 +1,2013 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 387 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 387 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*387);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*324);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..75eba66
--- /dev/null
@@ -0,0 +1,1221 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i entry of the neighbor list, four i sites (suffixes 0-3) interact with
+ * single-site j particles. The j particles are processed two at a time in one
+ * _fjsp_v2r8 value, followed by a one-particle remainder pass when the j count is odd.
+ *   - site 0 <-> j:     dispersion/repulsion from the cubic spline table, no cutoff test
+ *   - sites 1,2,3 <-> j: reaction-field electrostatics, masked by rsq < rcutoff2
+ *
+ * Inputs read:
+ *   nlist       - nri, iinr, jindex, jjnr, shift, gid
+ *   xx          - coordinates, accessed through xx[0] as a flat real array
+ *   fr          - shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp, rcoulomb
+ *   mdatoms     - chargeA, typeA
+ *   kernel_data - table_vdw (data, scale), energygrp_elec, energygrp_vdw
+ * Outputs:
+ *   ff (through ff[0]) and fr->fshift are handed to the gmx_fjsp_* force helpers from
+ *   kernelutil_sparc64_hpc_ace_double.h (presumably accumulated in place - confirm there);
+ *   kernel_data->energygrp_elec/energygrp_vdw at index gid[iidx] receive the potential sums;
+ *   nrnb receives the flop count through inc_nrnb().
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Many of the locals declared below (for example tx, jidxall, iq0, isai0, one_sixth,
+     * dummy_mask, gbconv, ewconv) are never referenced in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters.
+     * These are read once from the first i entry (nlist->iinr[0]) and reused for every
+     * i entry of the outer loop - assumes all i molecules in this list share the same
+     * charges and VdW type (TODO confirm against the neighbor list construction).
+     * Sites 1-3 get charges pre-multiplied by facel; site 0 only gets a VdW parameter offset.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Initialize the j indices/offsets up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover
+         * particle is handled by the separate block after this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps is what remains of rt after subtracting the integer index. The index is
+             * then scaled by 8 reals per table point, matching the +0/+2 (dispersion) and
+             * +4/+6 (repulsion) two-real loads below.
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION
+             * Y/G are loaded from the table entry of lane A and F/H from that of lane B; the
+             * transpose presumably regroups them so each register holds one coefficient for
+             * both lanes (see GMX_FJSP_TRANSPOSE2_V2R8 in the kernelutil header).
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: the same d*fscal term is accumulated for the i site
+             * and for the j particle; the j sum is handed to the decrement helper at the
+             * end of this iteration.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * The enclosing test only checks that some lane is inside the cutoff, so each
+             * lane is additionally masked with cutoff_mask before it is accumulated.
+             */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 179 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Remainder pass for one leftover j particle (A only). Potentials and fscal are passed through _fjsp_unpacklo_v2r8(...,zero) before accumulation, presumably to clear the unused upper lane. */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION
+             * Only the table entry for index i[0] is loaded here; the second operand of
+             * each transpose is a zero register instead of a second table load.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 179 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: the sums go to the energygrp_elec/energygrp_vdw slot selected by gid[iidx] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops (26 per outer iteration, 179 per j particle, as annotated in the loops above) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*179);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: no potential energies are accumulated here.
+ * For every neighborlist entry the four sites of the i water are paired with
+ * single j particles, two j particles (A and B) per SIMD pass, plus a
+ * one-particle remainder pass when the j count is odd.
+ *  - i site 0 interacts through the tabulated (cubic spline) VdW force only.
+ *  - i sites 1-3 interact through reaction-field electrostatics only; their
+ *    scalar force is masked with the rsq < rcutoff^2 comparison, where
+ *    rcutoff is fr->rcoulomb.
+ *
+ * nlist        neighborlist; nri, iinr, jindex, jjnr, shift and gid are read
+ * xx           coordinates, accessed as a flat real array with DIM reals per atom
+ * ff           forces, same flat layout; passed to the j-force decrement and
+ *              i-force update helpers
+ * fr           supplies shift_vec, fshift, epsfac, ic (k_rf, c_rf), ntype, nbfp
+ *              and rcoulomb
+ * mdatoms      supplies chargeA and typeA
+ * kernel_data  supplies table_vdw (data and scale)
+ * nrnb         flop accounting, incremented once at the end via inc_nrnb()
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Several of the locals declared below (e.g. tx, ty, tz, jidxall, ggid, iq0, the isai and isaj
+     * variables, velec, velecsum, vvdwsum, one, two, gbconv, ewconv) are never referenced in this
+     * force-only kernel; presumably the kernel generator emits them for every kernel flavour.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv exposes the two table indices as integers */
+
+    x                = xx[0]; /* rvec arrays are addressed below as flat real arrays */
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* read but not used in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac); /* folded into the i-site charges below */
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf); /* not used below; only krf2 enters the force */
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* not used below */
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters.
+     * The charges of i sites 1-3 (pre-multiplied by facel) and the VdW parameter offset of
+     * i site 0 are taken from the first i entry only and reused for every list entry.
+     * NOTE(review): presumably relies on all i waters in the list being identical and on
+     * nri > 0 (iinr[0] is read unconditionally) - confirm against the caller.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid compiler warnings: give the j indices and offsets defined initial values */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration.
+         * A single leftover j particle is handled by the block after this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10); /* no rinvsq00: pair 0-0 only uses the VdW table */
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * (1/r) */
+
+            /* Compute parameters for interactions between i and j atoms (c6/c12 of i site 0 with j particles A and B) */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps is the remainder rt - index. The index is multiplied by 8 below because the
+             * loads that follow read 8 reals per table point: offsets 0-3 for dispersion and
+             * offsets 4-7 for repulsion.
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION
+             * Assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c, the two statements building Fp and FF
+             * give FF = F + 2*G*eps + 3*H*eps^2 with eps = vfeps.
+             * The transposes presumably regroup the per-particle loads (A in one register, B in the
+             * other) into per-quantity registers - confirm against the macro definition.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (same evaluation, table offsets 4-7, scaled by c12) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Assuming _fjsp_msub_v2r8(a,b,c) computes a*b-c, felec = qq*(rinv*rinvsq - 2*k_rf).
+             * The same form is used for i sites 2 and 3 and in the remainder block.
+             */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 153 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j particles: one is left over after the paired loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8; /* scaled like i[0], but only vfconv.i[0] is used for table loads in this block */
+
+            /* CUBIC SPLINE TABLE DISPERSION (only the table point of particle A is loaded; the B inputs are zero) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second lane - confirm intrinsic semantics */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 153 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*153);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..99a176c
--- /dev/null
@@ -0,0 +1,2131 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 413 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 413 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*413);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*351);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..aba7840
--- /dev/null
@@ -0,0 +1,607 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 57 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 57 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*57);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only variant: no potential energies are accumulated or written, and
+ * kernel_data is not referenced anywhere in this function.
+ *
+ * For every entry iidx of the neighbor list, the i particle (coordinates loaded
+ * together with its shift vector) is paired with the j particles
+ * jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1]. The j particles are processed
+ * two per iteration (suffixes A and B); a single left-over j particle is handled
+ * by a separate epilogue block. The scalar force is masked with
+ * rsq < rcutoff2, where rcutoff is taken from fr->rcoulomb.
+ *
+ * Parameters:
+ *   nlist       neighbor list; nri, iinr, jindex, jjnr and shift are used
+ *               (gid is fetched but not used by this variant)
+ *   xx          particle coordinates, accessed as a flat real array through xx[0]
+ *   ff          particle forces, accessed as a flat real array through ff[0] and
+ *               handed to the force decrement/update helpers for j and i particles
+ *   fr          force record; shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb,
+ *               rvdw and ic->k_rf, ic->c_rf, ic->sh_invrc6 are fetched
+ *   mdatoms     chargeA and typeA are used
+ *   kernel_data not used by this variant
+ *   nrnb        flop accounting, updated once at the end through inc_nrnb()
+ *
+ * NOTE(review): this is generator output. Many locals (e.g. tx, ty, tz, ggid,
+ * velec, velecsum, vvdw, vvdw6, vvdw12, vvdwsum, one, two, one_sixth,
+ * one_twelfth, itab_tmp, dummy_mask and the *conv unions) are declared but never
+ * read here, and gid, krf, crf, rvdw and sh_vdw_invrcut6 are assigned but not
+ * used afterwards.
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice.
+     * Only the squared cutoff (rcutoff2) is compared against rsq00 below.
+     */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Pre-initialize the j indices and offsets so that compilers do not warn
+     * about possibly uninitialized variables.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start..j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        /* Reset the i particle force accumulators for this list entry */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge scaled by facel (fr->epsfac), and the
+         * offset into vdwparam that is later combined with 2*vdwtype[j] to locate the
+         * (c6,c12) pair for each i-j type combination.
+         */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j neighbors (A and B) per iteration.
+         * The bound j_index_end-1 leaves a possible single trailing neighbor
+         * to the epilogue block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * The whole block is skipped unless gmx_fjsp_any_lt_v2r8() reports
+             * rsq00 < rcutoff2 (presumably: for at least one of the two lanes);
+             * the per-lane cutoff_mask below then filters the force.
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Force only; presumably qq*(rinv*rinvsq - 2*k_rf), assuming
+             * _fjsp_msub_v2r8(a,b,c) evaluates a*b-c -- TODO confirm.
+             */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * Force only; presumably (c12*rinvsix - c6)*rinvsix*rinvsq under the
+             * same assumption about _fjsp_msub_v2r8.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate d*fscal on the i particle, then pass
+             * the same fscal and displacement to the helper that updates the two j forces.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 40 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates
+             * Epilogue: exactly one j neighbor is left, so only the A index is
+             * loaded and the single-pointer load/decrement helpers are used.
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles (single charge combined with a zero register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * Same computation as in the unrolled loop, for one j particle.
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force
+             * NOTE(review): the _fjsp_unpacklo_v2r8(fscal,zero) just above presumably
+             * clears the lane that has no j particle so it adds nothing to the
+             * i force -- confirm against the intrinsic's definition.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 40 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together
+         * with the i particle's slots in f and fshift.
+         */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD iteration) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops using the per-iteration counts quoted above (7 outer, 40 inner) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*40);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..8a88a91
--- /dev/null
@@ -0,0 +1,989 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 138 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 138 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*138);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: no potential energies are accumulated or written, and
+ * kernel_data is not referenced in the body.
+ *
+ * For every i entry of the neighborlist, three i atoms (suffixes 0,1,2) interact
+ * with each listed j atom. All three pairs get a reaction-field electrostatic
+ * force; only the pair involving i atom 0 also gets a Lennard-Jones force. Each
+ * pair force is AND-ed with the result of comparing rsq against rcutoff2 before
+ * it is accumulated.
+ *
+ * The j atoms are processed two at a time; when the number of j atoms is odd the
+ * last one is handled by a separate block after the main inner loop.
+ *
+ * Reads:  nlist (nri, iinr, jindex, jjnr, shift, gid), fr (shift_vec, fshift,
+ *         epsfac, ic->k_rf, ic->c_rf, ic->sh_invrc6, rcoulomb, rvdw, ntype, nbfp),
+ *         mdatoms (chargeA, typeA) and the coordinates in xx.
+ * Writes: forces through ff, data at fr->fshift (passed to the i-force update
+ *         helper), and the flop counter in nrnb.
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2 refer to the three i (water) atoms handled per outer iteration; the
+     * j side is a single particle and always uses suffix 0. A two-digit suffix such as 10
+     * names the pair (i atom 1, j atom 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two
+     * different jnr indices whose data are put in the two positions of the SIMD register.
+     * Many of the locals declared below are never used in this function
+     * (NOTE(review): presumably left over from the kernel generator template).
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters.
+     * The three i charges (each multiplied by facel) and the VdW parameter offset of
+     * i atom 0 are taken from the first i entry of the list only, and are reused
+     * unchanged for every i entry processed below. No VdW offset is set up for
+     * i atoms 1 and 2.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice.
+     * Only the squared cutoff rcutoff2 is used in the loops. sh_vdw_invrcut6 and rvdw
+     * are loaded below but are not referenced again in this force-only kernel.
+     */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Give these a defined value up front so the compiler does not warn about
+     * possibly uninitialized use.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over the nri i entries of the neighborlist */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset (in reals, DIM per vector) of the shift vector for this i entry */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start] up to,
+         * but not including, jjnr[j_index_end]
+         */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index (offset in reals of the first i atom) */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector. The force accumulators for the
+         * three i atoms are then cleared for this i entry.
+         */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) per iteration. The loop only
+         * runs while at least two j atoms remain; a single leftover j atom is handled
+         * by the block that follows the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates (one j atom per SIMD position) */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it (rinv and rinvsq) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles: charges of both j atoms and their offsets
+             * into the VdW parameter row. The j force accumulators are cleared afterwards.
+             */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * PAIR 00: RF ELEC + LJ  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * NOTE(review): reads as felec = qq*(rinv*rinvsq - krf2), assuming
+             * _fjsp_msub_v2r8(a,b,c) evaluates a*b-c - confirm against the intrinsic.
+             */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * NOTE(review): under the same assumption about _fjsp_msub_v2r8 this is
+             * fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: the same d*fscal term is added to the i atom
+             * accumulator and to the j accumulator. The j accumulator is handed to the
+             * "decrement" helper once all three pairs are done.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * PAIR 10: RF ELEC ONLY  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * PAIR 20: RF ELEC ONLY  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 109 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Remainder case: exactly one j atom (A) is left for this i entry.
+             * The arithmetic below mirrors the two-j loop body, but only data for j atom A
+             * is loaded, and every pair force additionally passes through
+             * _fjsp_unpacklo_v2r8(fscal, zero) before use.
+             * NOTE(review): presumably that clears the SIMD position that has no j atom -
+             * confirm against the intrinsic's documentation.
+             *
+             * load j atom coordinates
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it (rinv and rinvsq) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles: a single charge is loaded on top of a zero
+             * vector, and only the A offset into the VdW parameter row is computed.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * PAIR 00: RF ELEC + LJ  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * PAIR 10: RF ELEC ONLY  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * PAIR 20: RF ELEC ONLY  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 109 flops */
+        }
+
+        /* End of innermost loop.
+         * The accumulated forces of the three i atoms are handed to the helper below
+         * together with the i force address and the shift force address for this entry.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations by the number of j entries of this i entry */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations (one per i entry) */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 18 per i entry plus 109 per j entry */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*109);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d3b9655
--- /dev/null
@@ -0,0 +1,1887 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 369 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 369 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*369);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 304 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 304 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*304);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c4597a8
--- /dev/null
@@ -0,0 +1,1133 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 164 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 164 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*164);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 135 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 135 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*135);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..4ebe12c
--- /dev/null
@@ -0,0 +1,2043 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 398 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 398 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*398);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 333 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 333 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*333);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..dd908d3
--- /dev/null
@@ -0,0 +1,683 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField, masked per element with rsq < rcutoff^2 (rcutoff = fr->rcoulomb)
+ * VdW interaction:            LennardJones, multiplied by a polynomial switch in d = max(r - fr->rvdw_switch, 0)
+ * Geometry:                   Particle-Particle (one i atom per list entry, j atoms processed two at a time)
+ * Calculate force/pot:        PotentialAndForce (energy sums go to kernel_data->energygrp_elec/energygrp_vdw)
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters. Suffixes A,B refer to j loop unrolling done with double precision SIMD,
+     * i.e. the two different jnr indices whose data is put in the two positions of the SIMD register.
+     * NOTE(review): formula comments below read madd(a,b,c) as a*b+c and msub(a,b,c) as a*b-c - confirm.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* NOTE(review): tx,ty,tz,jidxall are not referenced by name below */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; /* NOTE(review): isai0 is not referenced by name below */
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; /* NOTE(review): fjx0,fjy0,fjz0,isaj0 are not referenced by name below */
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; /* NOTE(review): rvdw,fvdw6,fvdw12,sh_vdw_invrcut6 are not referenced by name below */
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp; /* NOTE(review): not referenced by name below */
+    _fjsp_v2r8       dummy_mask,cutoff_mask; /* NOTE(review): dummy_mask is not referenced by name below */
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* NOTE(review): initialised but not read below */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* NOTE(review): none of the three is referenced by name below */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters: coefficients of sw(d) (swV*) and of its derivative dsw(d) (swF*), scaled by the switch width */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar); /* NOTE(review): d is reassigned inside the j loop before it is read */
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Give the j indices/offsets a defined value so the compiler does not warn about possibly uninitialised use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0)); /* i charge pre-multiplied by facel (fr->epsfac) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset of this i type's row in vdwparam: two reals (c6,c12) per j type */
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2) /* two j atoms (A,B) per pass; an odd leftover is handled after the loop */
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* judging by the helper name: skip the work unless some element is inside the cutoff */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * (1/r) */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS: velec = qq*(1/r + krf*rsq - crf), felec = qq*(1/r^3 - krf2) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: vvdw = vvdw12/12 - vvdw6/6, fvdw = (vvdw12 - vvdw6)/rsq */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* clamp at zero, so sw = 1 and dsw = 0 for r <= rswitch */
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3)))); /* 1 + d^3*(swV3 + d*(swV4 + d*swV5)) */
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2)); /* d^2*(swF2 + d*(swF3 + d*swF4)) */
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) ); /* must use the unswitched vvdw, hence before the next line */
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2); /* per-element mask for rsq < rcutoff2 (per the intrinsic name) */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 73 flops */
+        }
+
+        if(jidx<j_index_end) /* one j atom left over: same math as above using only index A */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably loads the charge into the low element of a zeroed register - TODO confirm */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS: velec = qq*(1/r + krf*rsq - crf), felec = qq*(1/r^3 - krf2) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: vvdw = vvdw12/12 - vvdw6/6, fvdw = (vvdw12 - vvdw6)/rsq */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably pairs the low element with zero so the unused element adds nothing - TODO confirm */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 73 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies in the energy-group slot selected by this list's gid entry */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counts j atoms, not SIMD passes) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops using the per-iteration estimates quoted in the loop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*73);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField, masked per element with rsq < rcutoff^2 (rcutoff = fr->rcoulomb)
+ * VdW interaction:            LennardJones, multiplied by a polynomial switch in d = max(r - fr->rvdw_switch, 0)
+ * Geometry:                   Particle-Particle (one i atom per list entry, j atoms processed two at a time)
+ * Calculate force/pot:        Force (no energy sums are kept; kernel_data is not referenced in this body)
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters. Suffixes A,B refer to j loop unrolling done with double precision SIMD,
+     * i.e. the two different jnr indices whose data is put in the two positions of the SIMD register.
+     * NOTE(review): formula comments below read madd(a,b,c) as a*b+c and msub(a,b,c) as a*b-c - confirm.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; /* NOTE(review): ggid is not referenced by name in this force-only kernel */
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* NOTE(review): tx,ty,tz,jidxall are not referenced by name below */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; /* NOTE(review): isai0 is not referenced by name below */
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; /* NOTE(review): fjx0,fjy0,fjz0,isaj0 are not referenced by name below */
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2; /* NOTE(review): velec and velecsum are not referenced by name below */
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; /* NOTE(review): rvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6 are not referenced by name below */
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp; /* NOTE(review): not referenced by name below */
+    _fjsp_v2r8       dummy_mask,cutoff_mask; /* NOTE(review): dummy_mask is not referenced by name below */
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* NOTE(review): initialised but not read below */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* NOTE(review): none of the three is referenced by name below */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf); /* NOTE(review): assigned but not read here; only krf2 enters the force */
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* NOTE(review): assigned but not read here; no potential is computed */
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters: coefficients of sw(d) (swV*) and of its derivative dsw(d) (swF*), scaled by the switch width */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar); /* NOTE(review): d is reassigned inside the j loop before it is read */
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Give the j indices/offsets a defined value so the compiler does not warn about possibly uninitialised use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0)); /* i charge pre-multiplied by facel (fr->epsfac) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset of this i type's row in vdwparam: two reals (c6,c12) per j type */
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2) /* two j atoms (A,B) per pass; an odd leftover is handled after the loop */
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* judging by the helper name: skip the work unless some element is inside the cutoff */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * (1/r) */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS: felec = qq*(1/r^3 - krf2) */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: vvdw = vvdw12/12 - vvdw6/6, fvdw = (vvdw12 - vvdw6)/rsq */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) ); /* potential is still needed: the switched force below contains vvdw*dsw */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* clamp at zero, so sw = 1 and dsw = 0 for r <= rswitch */
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3)))); /* 1 + d^3*(swV3 + d*(swV4 + d*swV5)) */
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2)); /* d^2*(swF2 + d*(swF3 + d*swF4)) */
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2); /* per-element mask for rsq < rcutoff2 (per the intrinsic name) */
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 64 flops */
+        }
+
+        if(jidx<j_index_end) /* one j atom left over: same math as above using only index A */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably loads the charge into the low element of a zeroed register - TODO confirm */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS: felec = qq*(1/r^3 - krf2) */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: vvdw = vvdw12/12 - vvdw6/6, fvdw = (vvdw12 - vvdw6)/rsq */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably pairs the low element with zero so the unused element adds no force - TODO confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 64 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counts j atoms, not SIMD passes) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops using the per-iteration estimates quoted in the loop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..36c26d1
--- /dev/null
@@ -0,0 +1,1065 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i entry of the neighbour list this kernel handles three i atoms
+ * (suffixes 0,1,2) against the listed j atoms.  The j atoms are processed two
+ * at a time (lanes A and B), followed by a single-j remainder step when the
+ * j count is odd.  Only the 0-0 pair gets the switched Lennard-Jones term; the
+ * 1-0 and 2-0 pairs are reaction-field electrostatics only.
+ *
+ * Parameters (fields listed are the ones this function reads):
+ *   nlist        nri, iinr, jindex, jjnr, shift, gid
+ *   xx           coordinates, accessed as a flat real array through xx[0]
+ *   ff           forces, accessed as a flat real array through ff[0] and
+ *                handed to the gmx_fjsp_decrement_* / gmx_fjsp_update_iforce_* helpers
+ *   fr           shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp,
+ *                rcoulomb, rvdw_switch
+ *   mdatoms      chargeA, typeA
+ *   kernel_data  energygrp_elec and energygrp_vdw, indexed with gid[iidx]
+ *   nrnb         passed to inc_nrnb() with eNR_NBKERNEL_ELEC_VDW_W3_VF
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * NOTE(review): a number of the locals declared below (e.g. tx,ty,tz,jidxall,isai0..2,
+     * vdwioffset1,vdwioffset2,isaj0,r10,r20,c6_10,c12_10,c6_20,c12_20,rvdw,fvdw6,fvdw12,
+     * sh_vdw_invrcut6,itab_tmp,dummy_mask,two,vfconv,gbconv,ewconv) are not referenced by name
+     * anywhere in this function body; presumably they are emitted by the generator for all
+     * kernel variants.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0]; /* flat real view of the coordinate array */
+    f                = ff[0]; /* flat real view of the force array */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters.
+     * The three i charges (pre-multiplied by facel) and the VdW parameter offset of
+     * i atom 0 are taken once from the first i entry, iinr[0], and are not reloaded
+     * inside the outer loop, so they are applied to every i entry of this list.
+     * NOTE(review): iinr[0] is read here even when nri is 0.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters: polynomial coefficients scaled by powers of
+     * d_scalar = rcutoff_scalar - rswitch_scalar (swV* for the switch value,
+     * swF* for its derivative, as used for sw and dsw below).
+     */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar); /* NOTE(review): d is reassigned before it is read in the loops below */
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Give the j indices/offsets defined values up front to silence compiler
+     * warnings about possibly uninitialized use.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) per iteration; an odd
+         * last j atom is left for the remainder block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 0 - j atom 0:   *
+             * RF elec + switched LJ  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one lane is inside the cutoff -- TODO confirm */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c (TODO confirm against the
+             * intrinsic definitions) this evaluates
+             *   velec = qq*(krf*rsq + rinv - crf)
+             *   felec = qq*(rinv*rinvsq - krf2)
+             */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch); /* distance past the switch radius */
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* clamp at zero (assuming a lane-wise max) */
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 1 - j atom 0:   *
+             * RF elec only           *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 2 - j atom 0:   *
+             * RF elec only           *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        if(jidx<j_index_end) /* one j atom left over: the unrolled loop above stops at j_index_end-1 */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 0 - j atom 0:   *
+             * RF elec + switched LJ  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second lane -- TODO confirm */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 1 - j atom 0:   *
+             * RF elec only           *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 2 - j atom 0:   *
+             * RF elec only           *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*154);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 133 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 133 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*133);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9a66a46
--- /dev/null
@@ -0,0 +1,1963 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (with cutoff)
+ * VdW interaction:            LennardJones (with potential switch)
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 385 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 385 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*385);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 328 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 328 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*328);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7add775
--- /dev/null
@@ -0,0 +1,1213 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 182 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 182 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*182);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every i entry of the neighbor list it accumulates
+ * forces between four i sites (suffixes 0..3) and each listed j particle.
+ * Site 0 gets only the Lennard-Jones term, combined with a switch function
+ * of (r - rvdw_switch); sites 1..3 get only the reaction-field term. Every
+ * pair force is masked with the result of comparing rsq against rcutoff^2
+ * before it is accumulated. No potential energies are accumulated or stored.
+ *
+ * nlist        neighbor list: i entries (iinr), j ranges (jindex/jjnr), shift indices.
+ * xx           coordinates, read as a flat real array starting at xx[0].
+ * ff           forces, updated in place as a flat real array starting at ff[0].
+ * fr           source of shift_vec, fshift, epsfac, the reaction-field constants
+ *              (ic->k_rf, ic->c_rf), rcoulomb, rvdw_switch, ntype and nbfp.
+ * mdatoms      source of per-atom charges (chargeA) and VdW types (typeA).
+ * kernel_data  not referenced in this force-only variant.
+ * nrnb         flop accounting, updated once through inc_nrnb() on exit.
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Several declarations below (e.g. two, itab_tmp, dummy_mask, vfconv, gbconv, ewconv,
+     * velecsum, vvdwsum) are never referenced in this force-only variant.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters.
+     * The charges of sites 1..3 (pre-multiplied by facel) and the VdW parameter
+     * offset of site 0 are read once, from the first i entry of the list, and
+     * reused for every i entry in the outer loop below.
+     * NOTE(review): nlist->iinr[0] is read before nri is checked - confirm the
+     * list is never empty when this kernel is called.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters: swV3..swV5 are the coefficients used for sw and
+     * swF2..swF4 those used for dsw, each divided by a power of
+     * d_scalar = rcutoff - rswitch. The broadcast of d_scalar into d below is
+     * overwritten inside the loops before d is read.
+     * NOTE(review): the divisions assume rvdw_switch differs from rcoulomb - confirm.
+     */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Give the j indices and offsets defined values up front so the compiler
+     * does not warn about possibly uninitialized use.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) are handled per
+         * iteration; a single leftover j is handled after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function: combine the unswitched LJ force and potential
+             * with sw and dsw computed just above from d = max(r00 - rswitch, 0).
+             */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 161 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single leftover j particle; only the A index is used here) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function: combine the unswitched LJ force and potential
+             * with sw and dsw computed just above from d = max(r00 - rswitch, 0).
+             */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force.
+             * fscal was just combined with a zero vector through unpacklo, presumably
+             * to clear the lane that holds no j particle here - TODO confirm.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 161 flops */
+        }
+
+        /* End of innermost loop: pass the accumulated forces of the four i sites,
+         * together with the f and fshift destinations, to the i-force update helper.
+         */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: each i entry is counted as 24 flops and each
+     * j entry as 161 flops.
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*161);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..0d66250
--- /dev/null
@@ -0,0 +1,2123 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 416 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 416 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*416);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 359 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 359 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*359);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..da6aa20
--- /dev/null
@@ -0,0 +1,534 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (k_rf, c_rf from fr->ic; masked with the cutoff fr->rcoulomb)
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle (one i particle per list entry, j particles two at a time)
+ * Calculate force/pot:        PotentialAndForce (forces via ff and fr->fshift, energy via kernel_data->energygrp_elec)
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffix 0 is the particle index within a molecule (1,2,3 only occur in water kernels; this
+     * particle-particle kernel uses 0 only).
+     * Suffixes A,B refer to the two-way j loop unrolling with double precision SIMD: jnrA and jnrB are the
+     * two j particles whose data share one SIMD register. NOTE(review): tx/ty/tz, jidxall, vdw*, isa*, fj*,
+     * r00, c6_00/c12_00, itab_tmp, dummy_mask, one, two and the *conv unions are never read in this function. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice; rcutoff2 is the squared cutoff compared against rsq00 below */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Give the j indices/offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i particle (iinr[iidx]) per list entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector, used to index both shiftvec and fshift (DIM reals per vector) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: the j entries are jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer particle index and its offset into the coordinate array x and force array f */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge multiplied by facel (set from fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: the charges of jnrA and jnrB */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /* CALCULATE INTERACTIONS
+             * Skipped unless gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2) is nonzero (presumably: some pair is inside the cutoff).
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms (charge product) */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c, velec=qq*(krf*rsq+rinv-crf) and felec=qq*(rinv*rinvsq-krf2) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom; velec is first ANDed with cutoff_mask (presumably zeroing pairs outside the cutoff). */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: fscal*d is accumulated for i here; f at both j offsets is handed to the decrement helper */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 39 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Epilogue: an odd neighbor count leaves exactly one j particle, processed alone through the A slot */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: charge of jnrA via _fjsp_loadl_v2r8 on a zeroed register (presumably the other element stays 0) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /* CALCULATE INTERACTIONS
+             * Skipped unless gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2) is nonzero (presumably: some element is inside the cutoff).
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms (charge product) */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c, velec=qq*(krf*rsq+rinv-crf) and felec=qq*(rinv*rinvsq-krf2) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom; the unpacklo with a zero register presumably clears the element that has no j particle. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force: fscal*d is accumulated for i here; f at the single j offset is handed to the decrement helper */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 39 flops */
+        }
+
+        /* End of innermost loop: the accumulated i force is passed on together with its destinations in f and fshift */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum goes to the energygrp_elec entry selected by gid[iidx] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counted per j particle, not per two-wide loop pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: pass the estimate (8 per outer, 39 per inner iteration) to inc_nrnb */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*39);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (k_rf from fr->ic; masked with the cutoff fr->rcoulomb)
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle (one i particle per list entry, j particles two at a time)
+ * Calculate force/pot:        Force (forces via ff and fr->fshift; no energy is accumulated, kernel_data is not used)
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffix 0 is the particle index within a molecule (1,2,3 only occur in water kernels; this
+     * particle-particle kernel uses 0 only).
+     * Suffixes A,B refer to the two-way j loop unrolling with double precision SIMD: jnrA and jnrB are the
+     * two j particles whose data share one SIMD register. NOTE(review): tx/ty/tz, jidxall, vdw*, isa*, fj*, r00,
+     * c6_00/c12_00, itab_tmp, dummy_mask, one, two, the *conv unions, ggid, gid, krf, crf, velec and velecsum are never read here. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice; rcutoff2 is the squared cutoff compared against rsq00 below */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Give the j indices/offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i particle (iinr[iidx]) per list entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector, used to index both shiftvec and fshift (DIM reals per vector) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: the j entries are jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer particle index and its offset into the coordinate array x and force array f */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge multiplied by facel (set from fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: the charges of jnrA and jnrB */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /* CALCULATE INTERACTIONS
+             * Skipped unless gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2) is nonzero (presumably: some pair is inside the cutoff).
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms (charge product) */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: assuming msub(a,b,c)=a*b-c, felec=qq*(rinv*rinvsq-krf2); no potential is evaluated in this variant */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: fscal*d is accumulated for i here; f at both j offsets is handed to the decrement helper */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 33 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Epilogue: an odd neighbor count leaves exactly one j particle, processed alone through the A slot */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: charge of jnrA via _fjsp_loadl_v2r8 on a zeroed register (presumably the other element stays 0) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /* CALCULATE INTERACTIONS
+             * Skipped unless gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2) is nonzero (presumably: some element is inside the cutoff).
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms (charge product) */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: assuming msub(a,b,c)=a*b-c, felec=qq*(rinv*rinvsq-krf2); no potential is evaluated in this variant */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force: the unpacklo with zero above presumably clears the element without a j particle; f at the single j offset is handed to the decrement helper */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 33 flops */
+        }
+
+        /* End of innermost loop: the accumulated i force is passed on together with its destinations in f and fshift */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per two-wide loop pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: pass the estimate (7 per outer, 33 per inner iteration) to inc_nrnb */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*33);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..8556bfe
--- /dev/null
@@ -0,0 +1,916 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*120);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * For each of the nlist->nri list entries this routine walks the j particles
+ * jjnr[jindex[i]] .. jjnr[jindex[i+1]-1], two per iteration, and accumulates
+ * the electrostatic force between the three i sites (suffixes 0,1,2) and each
+ * j particle. Contributions are masked with a (rsq < rcutoff2) comparison, where
+ * rcutoff2 is fr->rcoulomb squared. Only forces are produced; no potential
+ * energy is accumulated in this variant.
+ *
+ * Inputs read:
+ *   nlist       - nri, iinr, jindex, jjnr, shift (gid is fetched but not used here)
+ *   xx          - coordinates, accessed as a flat real array through xx[0]
+ *   fr          - shift_vec, fshift, epsfac, rcoulomb, ic->k_rf, ic->c_rf
+ *   mdatoms     - chargeA
+ *   kernel_data - not referenced by this force-only kernel
+ * Outputs written:
+ *   ff          - forces, accessed as a flat real array through ff[0]
+ *   fr->fshift  - handed to gmx_fjsp_update_iforce_3atom_swizzle_v2r8 together with f
+ *   nrnb        - flop estimate recorded through inc_nrnb()
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    /* Note: this generated kernel declares a common set of locals; several of them
+     * (for example tx, ty, tz, jidxall, vdwioffset0, isai0, r00, c6_00, velecsum,
+     * itab_tmp, dummy_mask, one, two, vfconv) are not referenced below.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    /* View the rvec arrays as flat arrays of real */
+    x                = xx[0];
+    f                = ff[0];
+
+    /* Cache neighborlist and force-record fields in locals */
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    /* Reaction-field constants; krf2 holds 2*k_rf. crf is set but only the
+     * force expression (which uses krf2) appears in this kernel.
+     */
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    /* The three i-site charges (scaled by facel) are read once, from the first
+     * list entry, and reused for every i entry in the outer loop; presumably all
+     * i waters in this list carry identical charges -- TODO confirm.
+     * NOTE(review): iinr[0] is read here regardless of the value of nri.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        /* Reset the force accumulators of the three i sites for this list entry */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        /* Two j particles (A and B) are processed per iteration; a possible
+         * single leftover j particle is handled after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /* j-force accumulator, shared by the three i-site interactions below */
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i site 0 with j; the whole block is skipped when the any_lt test fails */
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            /* Scalar force factor qq*(rinv*rinvsq - 2*k_rf), assuming
+             * _fjsp_msub_v2r8(a,b,c) evaluates a*b-c -- TODO confirm.
+             */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            /* Apply the per-lane cutoff mask to the force factor */
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            /* The same contribution is collected for j and passed to the decrement helper later */
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i site 1 with j */
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i site 2 with j */
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /* Hand the accumulated j contributions to the force array entries of particles A and B */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 102 flops */
+        }
+
+        /* Remainder: one j particle is left when the number of j entries is odd */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            /* Only one charge is loaded; presumably into the low lane, with the
+             * other lane taken from the zero vector -- TODO confirm.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i site 0 with the leftover j */
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Presumably keeps the low lane of fscal and fills the other lane from
+             * the zero vector, so the unused lane adds nothing -- TODO confirm.
+             */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i site 1 with the leftover j */
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i site 2 with the leftover j */
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /* Single-pointer variant: only particle A receives the j contribution */
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 102 flops */
+        }
+
+        /* End of innermost loop */
+
+        /* Pass the accumulated i-site forces to the helper together with the
+         * force and shift-force locations of this list entry.
+         */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    /* Flop estimate: 18 per outer iteration plus 102 per j particle */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*102);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..281bccc
--- /dev/null
@@ -0,0 +1,1820 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*351);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*297);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..4896f8f
--- /dev/null
@@ -0,0 +1,916 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*120);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every i entry of the neighbor list, atoms 1..3 of the
+ * i water (the three rvecs starting at x+i_coord_offset+DIM) interact with
+ * single j particles through reaction-field electrostatics with an explicit
+ * cut-off. j particles are processed two per SIMD iteration; an odd trailing
+ * j particle is processed separately after the unrolled loop.
+ *
+ * nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+ * xx          - coordinates (read only in this function).
+ * ff          - forces; passed to the helpers that update the i and j forces.
+ * fr          - supplies shift_vec, fshift, epsfac, rcoulomb and ic->k_rf/c_rf.
+ * mdatoms     - supplies chargeA.
+ * kernel_data - not referenced by this force-only kernel.
+ * nrnb        - flop accounting, updated once at the end through inc_nrnb().
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     *
+     * NOTE(review): this is generator output and the declaration list below is a superset of
+     * what this force-only kernel references (e.g. tx/ty/tz, velec/velecsum, one/two and the
+     * vfconv/gbconv/ewconv union are never read here).
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf: the only RF constant the force expressions below use */
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters: charges of water atoms 1..3, pre-scaled by facel.
+     * They are read once from the first i entry (iinr[0]) and reused for every i entry;
+     * presumably all waters in this list carry identical charges - verify against the caller.
+     * NOTE(review): iinr[0] is read here, before the loop, i.e. also when nri is 0.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice.
+     * Only the squared cut-off (rcutoff2) is compared against squared distances below.
+     */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Pre-initialize the j indices and offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * The +DIM offset skips the first atom of the water, so atoms 1..3 are loaded.
+         */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration.
+         * The bound j_index_end-1 leaves an odd last j particle for the block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* Load the coordinates of j particles A and B */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vectors (i minus j) for each of the three i water atoms */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles (the charges of A and B) */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2)) /* i atom 1 - j; by the helper's name, taken when any lane is inside the cut-off */
+            {
+
+            /* Compute parameters for interactions between i atom 1 and the j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * felec = qq*(rinv*rinvsq - krf2), assuming _fjsp_msub_v2r8(a,b,c) yields a*b-c - TODO confirm.
+             */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* presumably clears fscal in lanes outside the cut-off - TODO confirm */
+
+            /* Update vectorial force: the same fscal*d term goes into the i atom accumulator and
+             * into the j accumulator (assuming _fjsp_madd_v2r8(a,b,c) yields a*b+c - TODO confirm).
+             */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2)) /* i atom 2 - j */
+            {
+
+            /* Compute parameters for interactions between i atom 2 and the j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expression as for i atom 1) */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force on i atom 2 and in the j accumulator */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2)) /* i atom 3 - j */
+            {
+
+            /* Compute parameters for interactions between i atom 3 and the j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expression as for i atom 1) */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force on i atom 3 and in the j accumulator */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 102 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j particles: exactly one is left over */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Load the coordinates of the single remaining j particle */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vectors (i minus j) for each of the three i water atoms */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles: only one charge is loaded here; the zero vector
+             * passed as first argument presumably supplies the other lane - TODO confirm.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2)) /* i atom 1 - remaining j */
+            {
+
+            /* Compute parameters for interactions between i atom 1 and the j atom */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expression as in the unrolled loop) */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* extra step vs. the unrolled loop; presumably zeroes the lane that holds no j particle - TODO confirm */
+
+            /* Update vectorial force on i atom 1 and in the j accumulator */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2)) /* i atom 2 - remaining j */
+            {
+
+            /* Compute parameters for interactions between i atom 2 and the j atom */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expression as in the unrolled loop) */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force on i atom 2 and in the j accumulator */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2)) /* i atom 3 - remaining j */
+            {
+
+            /* Compute parameters for interactions between i atom 3 and the j atom */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expression as in the unrolled loop) */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force on i atom 3 and in the j accumulator */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 102 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated forces of i atoms 1..3 to the helper,
+         * together with the force and shift-force locations for this i entry.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD iteration) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted above (18 outer, 102 inner) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*102);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7d92949
--- /dev/null
@@ -0,0 +1,1820 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*351);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*297);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..32d447f
--- /dev/null
@@ -0,0 +1,683 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Reads coordinates from xx; hands ff and fr->fshift to the force-update helpers; updates
+     * kernel_data->energygrp_elec/energygrp_vdw and reports the flop count through nrnb.
+     * Suffixes 0,1,2,3 are particle indices for waters; this particle-particle kernel only uses 0.
+     * Suffixes A,B are the two j particles (two jnr indices) processed together, one per SIMD lane
+     * (two lanes, cf. the i[2] member of the vfconv union below). */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+    /* NOTE: several locals above (e.g. rcutoff*, tx/ty/tz, fjx0.., one, two, gbconv, ewconv) are never used here; presumably generator boilerplate. */
+    x                = xx[0];
+    f                = ff[0];
+    /* Cache neighbor-list arrays and interaction parameters in locals */
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    /* VdW spline table data and the scale that is multiplied with r to obtain the table index */
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Pre-initialize these to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    /* Counters feeding the flop accounting in inc_nrnb() at the end of the function */
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: the j entries of this list are jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        /* Zero the force accumulators for this i particle */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge pre-multiplied by facel; offset of this type's row in vdwparam (2 reals per type pair) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* r = rsq*rinv; r is what gets scaled into a table index below */
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Stride of 8 reals per table point: dispersion is read at offsets +0/+2, repulsion at +4/+6 */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+            /* Loads use index i[0] for particle A and i[1] for B; the transposes presumably regroup the data per coefficient (Y,F / G,H) */
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 70 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Epilogue for an odd number of neighbors: a single j particle (A) remains after the pairwise loop */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles (charge is loaded into a zero vector, presumably leaving the other lane at zero) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION (only vfconv.i[0] is used for table reads; the second load is replaced by zeros) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (again only vfconv.i[0] is read; the second load is replaced by zeros) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom; the unpacklo with zero presumably clears the unused lane first. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+            /* Same unpacklo-with-zero treatment for the force scalar before it is applied */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 70 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies in the energygrp_* entries at index ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+    /* 9 flops per outer and 70 per inner iteration, matching the per-loop counts noted above */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*70);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 57 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 57 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*57);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..681fc26
--- /dev/null
@@ -0,0 +1,989 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (evaluated for all three i atoms)
+ * VdW interaction:            CubicSplineTable (evaluated for i atom 0 only)
+ * Geometry:                   Water3-Particle (three i atoms against single j atoms; two j atoms per SIMD iteration)
+ * Calculate force/pot:        PotentialAndForce (forces are accumulated via f/fshift, energies via kernel_data->energygrp_elec/energygrp_vdw)
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,       /* neighbor list; reads nri, iinr, jindex, jjnr, shift, gid */
+                     rvec * gmx_restrict                    xx,          /* coordinates; accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff,          /* forces; accessed as a flat real array through ff[0] */
+                     t_forcerec * gmx_restrict              fr,          /* reads shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp */
+                     t_mdatoms * gmx_restrict               mdatoms,     /* reads chargeA and typeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* reads table_vdw; energygrp_elec/energygrp_vdw receive the potentials */
+                     t_nrnb * gmx_restrict                  nrnb)        /* flop counters, passed to inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar; /* not referenced in this kernel */
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv reads the two table indices back as integers; gbconv/ewconv are not used here */
+    /* View the rvec coordinate/force arrays as flat real arrays (indexed with DIM*atom offsets below) */
+    x                = xx[0];
+    f                = ff[0];
+    /* Copy neighbor-list, force-record and per-atom data into locals; scalar constants go into SIMD registers via set1 */
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    /* Van der Waals table data and its scale factor (the scale turns r into a table index below) */
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+    /* The i-atom charges (pre-multiplied by facel) and the VdW offset are read once, using the first list entry iinr[0] */
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+    /* Iteration counters, used only for the flop accounting at the end */
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        /* Zero the force accumulators of the three i atoms */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+        /* The loop below consumes j atoms two at a time (A,B); a leftover single j atom is handled after it */
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            /* rinv is presumably 1/sqrt(rsq), going by the helper name - TODO confirm against kernelutil header */
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            /* rinvsq = rinv*rinv */
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            /* Zero the j force accumulators for this pair of j atoms */
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i atom 0 - j: reaction-field electrostatics plus tabulated VdW; r00 = rsq00*rinv00 feeds the table index */
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Table stride is 8 reals per point: offsets +0 and +2 are loaded for dispersion, +4 and +6 for repulsion */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+            /* If madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c (TODO confirm): velec=qq*(rinv+krf*rsq-crf), felec=qq*(rinv*rinvsq-krf2) */
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+            /* Under the same assumption: Fp=F+eps*(G+eps*H), VV=Y+eps*Fp, FF=Fp+eps*(G+2*eps*H), with eps=vfeps */
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            /* Accumulate the same term for the j atoms; the sum is handed to the decrement helper at the end of the iteration */
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i atom 1 - j: electrostatics only (fscal is just felec, no VdW term) */
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i atom 2 - j: electrostatics only (fscal is just felec, no VdW term) */
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+            /* Hand the accumulated j forces (from all three i atoms) to the decrement helper for both j atoms */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+        /* Leftover j atom (odd count): same math for one j atom; energies and fscal pass through unpacklo(...,zero) before being accumulated */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+            /* Only the table entry for index vfconv.i[0] is loaded here; the second operand of each transpose is zero */
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+            /* unpacklo with a zero vector presumably clears the unused second lane before summing - TODO confirm */
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        /* End of innermost loop */
+        /* Hand the accumulated i forces to the update helper, together with the f and fshift slots of this list entry */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+        /* The energy-group index of this list entry selects the energygrp_elec/energygrp_vdw slots passed to the helper */
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 20 per list entry plus 143 per j atom, matching the per-loop flop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*143);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*120);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..b314416
--- /dev/null
@@ -0,0 +1,1659 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 350 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 350 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*350);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..22d3111
--- /dev/null
@@ -0,0 +1,1097 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (evaluated for i sites 1-3, using k_rf and c_rf from fr->ic)
+ * VdW interaction:            CubicSplineTable (evaluated for i site 0 only, read from kernel_data->table_vdw)
+ * Geometry:                   Water4-Particle (four i sites per list entry against single j particles)
+ * Calculate force/pot:        PotentialAndForce (forces are handed to the ff/fshift update helpers, energies to kernel_data)
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv: integer view of the table indices; gbconv,ewconv are not used in this kernel */
+    /* View coordinates and forces as flat real arrays; atom n starts at offset DIM*n (see the *_coord_offset variables) */
+    x                = xx[0];
+    f                = ff[0];
+    /* Cache neighbor-list arrays, force-field parameters and per-atom data in locals */
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    /* VdW table data and its scale factor (r multiplied by the scale gives the table position, see rt below) */
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters once, from the first i entry; they are reused unchanged for every list entry */
+    inr              = nlist->iinr[0]; /* NOTE(review): read before nri is checked - confirm callers never pass an empty list */
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Give these a defined value so that compilers do not warn about possibly-uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i water (nri entries) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector in the flat shift arrays (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* The j neighbors of this entry are jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* First atom of the i water and its offset in the flat coordinate/force arrays */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load the coords of the four i sites with the shift vector added (as the helper name indicates) */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums; they accumulate over all j neighbors of this i water */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* Load j atom coordinates of A and B into one SIMD triple (jx0,jy0,jz0) */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vectors (i minus j) from the j particles to each of the four i sites */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distances, 1/r, and 1/r^2 (the latter only for the electrostatic sites 1-3) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles: charges and offsets into the VdW parameter row (2 reals per type) */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /*********************************************************
+             * CALCULATE INTERACTIONS: i site 0 - j, VdW table only  *
+             *********************************************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Load c6 and c12 for the (i site 0, j) type pairs of A and B */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer; vfeps is the remainder */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* 8 table reals per point: offsets 0-3 are read for dispersion and offsets 4-7 for repulsion below */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION: assuming madd(a,b,c)=a*b+c, VV = Y+eps*(F+eps*(G+eps*H)) and FF = dVV/deps */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: same evaluation as above on the second half of the table point, scaled by c12 */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: accumulate d*fscal for the i site and for the j particle (fj* is passed to the decrement helper below) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /****************************************************************
+             * CALCULATE INTERACTIONS: i site 1 - j, reaction-field only    *
+             ****************************************************************/
+
+            /* Charge product for this pair; iq1 already includes the epsfac factor */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c, velec = qq*(1/r+krf*r^2-crf), felec = qq*(1/r^3-2*krf) */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /****************************************************************
+             * CALCULATE INTERACTIONS: i site 2 - j, reaction-field only    *
+             ****************************************************************/
+
+            /* Charge product for this pair; iq2 already includes the epsfac factor */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /****************************************************************
+             * CALCULATE INTERACTIONS: i site 3 - j, reaction-field only    *
+             ****************************************************************/
+
+            /* Charge product for this pair; iq3 already includes the epsfac factor */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 167 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Odd neighbor count: handle the last j particle alone; results pass through unpacklo(...,zero) before accumulation, presumably clearing the unused lane */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Load j atom coordinates of the single remaining particle */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vectors (i minus j) from the j particle to each of the four i sites */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distances, 1/r, and 1/r^2 (the latter only for the electrostatic sites 1-3) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for the j particle: its charge (loaded on top of a zeroed register) and VdW type offset */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /*********************************************************
+             * CALCULATE INTERACTIONS: i site 0 - j, VdW table only  *
+             *********************************************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Load c6 and c12 for the (i site 0, j) type pair */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer; vfeps is the remainder */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* 8 table reals per point; only vfconv.i[0] is used for loads here, zeros stand in for the second particle */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /****************************************************************
+             * CALCULATE INTERACTIONS: i site 1 - j, reaction-field only    *
+             ****************************************************************/
+
+            /* Charge product for this pair; iq1 already includes the epsfac factor */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /****************************************************************
+             * CALCULATE INTERACTIONS: i site 2 - j, reaction-field only    *
+             ****************************************************************/
+
+            /* Charge product for this pair; iq2 already includes the epsfac factor */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /****************************************************************
+             * CALCULATE INTERACTIONS: i site 3 - j, reaction-field only    *
+             ****************************************************************/
+
+            /* Charge product for this pair; iq3 already includes the epsfac factor */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 167 flops */
+        }
+
+        /* End of innermost loop */
+        /* Pass the accumulated i site forces to the helper together with the force and shift-force destinations */
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: entry ggid of the energygrp_elec and energygrp_vdw arrays */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: report 26 per outer iteration and 167 per j particle to the nrnb counter */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*167);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 144 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 144 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*144);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..1688019
--- /dev/null
@@ -0,0 +1,1779 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to the data put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 377 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 377 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*377);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*324);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..dc84753
--- /dev/null
@@ -0,0 +1,549 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle (one i particle per list entry; j particles are taken two at a time)
+ * Calculate force/pot:        PotentialAndForce (energy sums are handed over at kernel_data->energygrp_elec/energygrp_vdw + gid[iidx])
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* For each of the nri list entries: one i particle against its j neighbours, reaction-field + Lennard-Jones.
+     * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or just 0 for non-waters.
+     * Suffixes A,B refer to the j loop unrolling: the two jnr indices (jnrA,jnrB) whose data share one SIMD register.
+     * NOTE(review): several locals declared below (tx, rcutoff, isai0, fjx0, itab_tmp, vfconv, ...) are never used here.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Give the j indices/offsets defined values up front; this only serves to silence compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i particle (iinr[iidx]) per list entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset into shiftvec/fshift for this list's shift index */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* The j neighbours of this entry are jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer (i) particle index and its offset into the x/f arrays */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge scaled by facel (fr->epsfac), and the i-type offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset the per-i potential sums; they are handed to gmx_fjsp_update_1pot_v2r8 after the j loops */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per pass; a possible odd last one is handled after this loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: charges, and the j-type offsets (2 reals per type) added to vdwioffset0 below */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms (charge product and the c6/c12 pair) */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS: presumably velec = qq*(rinv + krf*rsq - crf) and felec = qq*(rinv*rinvsq - 2*krf), assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c -- TODO confirm */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: vvdw6 = c6*rinv^6, vvdw12 = c12*rinv^12; the 1/6 and 1/12 factors in vvdw suggest c6/c12 are stored pre-multiplied by 6 and 12 -- TODO confirm */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force: fscal times the displacement is accumulated for i; the j side is left to the helper below */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 47 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Odd number of j particles: process the last one alone, using only the A index/offset */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles (the single charge is loaded on top of a zero register, presumably leaving the other element 0) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms (charge product and the c6/c12 pair) */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expressions as in the two-j loop above) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the two-j loop above) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom; unpacklo with a zero register presumably discards the element that has no j particle */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force (fscal was masked above in the same way as the potentials) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 47 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies for the energy group index (gid) of this list entry */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j particle, not per two-j pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops using the per-loop estimates noted above (9 per list entry, 47 per j particle) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*47);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle (one i particle per list entry; j particles are taken two at a time)
+ * Calculate force/pot:        Force (no potential sums are formed and kernel_data is not read in this variant)
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* For each of the nri list entries: one i particle against its j neighbours, reaction-field + Lennard-Jones forces only.
+     * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or just 0 for non-waters.
+     * Suffixes A,B refer to the j loop unrolling: the two jnr indices (jnrA,jnrB) whose data share one SIMD register.
+     * NOTE(review): many locals below (tx, rcutoff, velec, vvdw, ggid, vfconv, ...) are never used; krf, crf and gid are set but not read.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Give the j indices/offsets defined values up front; this only serves to silence compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i particle (iinr[iidx]) per list entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset into shiftvec/fshift for this list's shift index */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* The j neighbours of this entry are jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer (i) particle index and its offset into the x/f arrays */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge scaled by facel (fr->epsfac), and the i-type offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j particles (A,B) per pass; a possible odd last one is handled after this loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: charges, and the j-type offsets (2 reals per type) added to vdwioffset0 below */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms (charge product and the c6/c12 pair) */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS: presumably felec = qq*(rinv*rinvsq - 2*krf), assuming msub(a,b,c)=a*b-c -- TODO confirm */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: presumably fvdw = (c12*rinv^6 - c6)*rinv^8, under the same msub assumption */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force: fscal times the displacement is accumulated for i; the j side is left to the helper below */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 37 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Odd number of j particles: process the last one alone, using only the A index/offset */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles (the single charge is loaded on top of a zero register, presumably leaving the other element 0) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms (charge product and the c6/c12 pair) */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expression as in the two-j loop above) */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expression as in the two-j loop above) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force; the unpacklo with a zero register above presumably discards the element that has no j particle */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 37 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per two-j pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops using the per-loop estimates noted above (7 per list entry, 37 per j particle) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*37);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..13978d1
--- /dev/null
@@ -0,0 +1,855 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (evaluated for each of the three i atoms against the j atom)
+ * VdW interaction:            LennardJones (evaluated for the first i atom only)
+ * Geometry:                   Water3-Particle (three i atoms per list entry, one atom per j entry)
+ * Calculate force/pot:        PotentialAndForce (force helpers get ff/fr->fshift, energy helpers get kernel_data)
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list: nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates, read through xx[0] */
+                     rvec * gmx_restrict                    ff,           /* forces, handed to the force-update helpers through ff[0] */
+                     t_forcerec * gmx_restrict              fr,           /* supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* supplies chargeA and typeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* supplies the energygrp_elec / energygrp_vdw accumulators */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, updated once via inc_nrnb at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters; two-digit suffixes such as 10 name the (i atom, j atom) pair.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA, jnrB) that are processed together in one inner-loop iteration.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar; /* not referenced in this kernel */
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* only fscal is referenced below */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; /* isai0 is not referenced */
+    int              vdwioffset1; /* not referenced: only i atom 0 gets a VdW term here */
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; /* isai1 is not referenced */
+    int              vdwioffset2; /* not referenced: only i atom 0 gets a VdW term here */
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; /* isai2 is not referenced */
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; /* isaj0 is not referenced */
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; /* r00 is not referenced */
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; /* r10, c6_10, c12_10 are not referenced */
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; /* r20, c6_20, c12_20 are not referenced */
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; /* rvdw, fvdw6, fvdw12, sh_vdw_invrcut6 are not referenced */
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp; /* not referenced in this kernel */
+    _fjsp_v2r8       dummy_mask,cutoff_mask; /* not referenced in this kernel */
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* initialised but not referenced */
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* initialised but not referenced */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* not referenced in this kernel */
+
+    x                = xx[0]; /* address of the first coordinate; indexed below as x + DIM*atom */
+    f                = ff[0]; /* address of the first force component; indexed the same way */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters once, from the first list entry; they are reused for every i entry in the list */
+    inr              = nlist->iinr[0]; /* NOTE(review): read before nri is checked - confirm iinr[0] is valid for an empty list */
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0])); /* i charges are pre-multiplied by facel (fr->epsfac) */
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset into vdwparam for i atom 0; the j part 2*vdwtype[j] is added in the loops */
+
+    /* Give the j indices/offsets defined values up front; the original note says this is to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries jindex[iidx] .. jindex[iidx+1]-1 belong to this i entry */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (done by the helper; results land in ix0..iz2) */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j entries are consumed two at a time (A,B); an odd leftover entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 0 with j: reaction-field + Lennard-Jones)
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS: velec = qq*(krf*rsq + rinv - crf), felec = qq*(rinv*rinvsq - 2*krf), assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: vvdw = vvdw12/12 - vvdw6/6, fvdw = (vvdw12 - vvdw6)*rinvsq (same msub assumption as above) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force: the same d*fscal term is accumulated for the i atom and for the j atom */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 1 with j: reaction-field only)
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 2 with j: reaction-field only)
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* hand the summed j terms to the helper for both j atoms */
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end) /* one unpaired j entry remains (odd number of neighbors) */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles (single j: presumably the charge goes into the low element of a zeroed register - TODO confirm) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 0 with j: reaction-field + Lennard-Jones)
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expressions as in the paired loop) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the paired loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably zeroes the element that has no j atom so it adds nothing to the sum - TODO confirm */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8()); /* same masking applied to the VdW energy */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same masking applied to the scalar force */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 1 with j: reaction-field only)
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *  (i atom 2 with j: reaction-field only)
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* hand the summed j terms to the helper for the single j atom */
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop: pass the accumulated i-atom terms to the helper together with the force and shift-force addresses */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx]; /* index into the energygrp_elec / energygrp_vdw arrays for this list entry */
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 20 per outer iteration plus 120 per counted inner iteration */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*120);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 100 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 100 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*100);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..b419e70
--- /dev/null
@@ -0,0 +1,1525 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Initialize the j indices and coordinate offsets to zero to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 327 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 327 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*327);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 277 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 277 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..45cb100
--- /dev/null
@@ -0,0 +1,963 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*143);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every i entry in nlist, four i sites (suffixes 0-3)
+ * interact with each listed j particle. Site 0 gets only the Lennard-Jones
+ * term; sites 1-3 get only the reaction-field term. No potential energies
+ * are accumulated here and kernel_data is not referenced in this function.
+ *
+ * Inputs read:
+ *   nlist   - nri, iinr, jindex, jjnr, shift, gid
+ *   xx      - particle coordinates (accessed as a flat real array via xx[0])
+ *   fr      - shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp
+ *   mdatoms - chargeA, typeA
+ *
+ * Outputs:
+ *   ff and fr->fshift are handed to the gmx_fjsp_decrement_1rvec_* and
+ *   gmx_fjsp_update_iforce_4atom_swizzle_v2r8 helpers (presumably j forces
+ *   are decremented and i/shift forces incremented -- confirm in
+ *   kernelutil_sparc64_hpc_ace_double.h).
+ *   nrnb receives the flop estimate through inc_nrnb().
+ *
+ * Several locals declared below (e.g. tx, rcutoff, vfconv) are not referenced
+ * in this kernel variant.
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters.
+     * Charges of sites 1-3 are pre-multiplied by facel; site 0 only gets a
+     * VdW parameter offset. These are read once, from the first i entry.
+     * NOTE(review): presumably all i entries share the same charges/types,
+     * and iinr[0] is read even when nri is 0 -- confirm with the caller.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Dummy initial values; only here to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A and B) per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * rinvsix = rinvsq00^3. Assuming _fjsp_msub_v2r8(a,b,c) is a*b-c
+             * (TODO confirm intrinsic), fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq00.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Assuming _fjsp_msub_v2r8(a,b,c) is a*b-c (TODO confirm intrinsic),
+             * felec = qq*(rinv*rinvsq - krf2). The same form is used for sites 2 and 3.
+             */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 123 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Remainder: one j entry (A) is left when the list length is odd */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+            /* presumably keeps lane A and zeroes the unused upper lane -- TODO confirm */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 123 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counts j entries, not SIMD iterations) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 24 per i entry plus 123 per j entry */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*123);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..ae69a1d
--- /dev/null
@@ -0,0 +1,1645 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 353 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 353 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*353);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 303 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 303 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7155768
--- /dev/null
@@ -0,0 +1,486 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (uses fr->ic->k_rf and fr->ic->c_rf, scaled by fr->epsfac)
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle (one i particle per list entry against its j neighbors)
+ * Calculate force/pot:        PotentialAndForce (forces go to ff and fr->fshift, energies to kernel_data->energygrp_elec)
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to the j loop being unrolled by two for double precision SIMD: jnrA and jnrB
+     * are the two j indices whose data are placed in the two elements of a _fjsp_v2r8 register.
+     * NOTE(review): rcutoff_scalar, tx/ty/tz, rcutoff, rcutoff2, jidxall, vdwioffset0, isai0, vdwjidx0A/B, fjx0/fjy0/fjz0, isaj0, r00, c6_00, c12_00, itab_tmp, dummy_mask, cutoff_mask, one, two, vfconv, gbconv and ewconv are never referenced below; presumably generator-template leftovers. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Pre-set the j indices and coordinate offsets so compilers do not warn about possibly uninitialized variables */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per list entry, nri entries in total */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; used to index both shiftvec and fshift (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: jjnr[j_index_start] up to jjnr[j_index_end-1] belong to this entry */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index (DIM reals per particle in x and f) */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j neighbors (A and B) per iteration; an odd leftover neighbor is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: velec = qq*(krf*rsq + rinv - crf), felec = qq*(rinv*rinvsq - 2*krf); assumes madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c - TODO confirm */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force: accumulate fscal*d on the i particle; the helper call below handles the two j particles' forces */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 35 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Leftover single j neighbor (odd neighbor count): load j atom coordinates through one pointer only */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: a single charge is loaded on top of a zeroed register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: same expressions as in the unrolled loop above */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom; velec is first combined with a zero register, presumably to blank the unused element - TODO confirm */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force: same accumulation as above, with the one-pointer helper for the single j particle */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 35 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum is passed on for the electrostatic energy slot of energy group ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 8 per outer iteration plus 35 per j neighbor, reported under eNR_NBKERNEL_ELEC_VF */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*35);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (only fr->ic->k_rf enters the force, scaled by fr->epsfac)
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle (one i particle per list entry against its j neighbors)
+ * Calculate force/pot:        Force (forces go to ff and fr->fshift; no energies are accumulated)
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to the j loop being unrolled by two for double precision SIMD: jnrA and jnrB
+     * are the two j indices whose data are placed in the two elements of a _fjsp_v2r8 register.
+     * NOTE(review): gid, krf and crf are assigned but never read, and ggid, velec, velecsum, rcutoff_scalar, tx/ty/tz, rcutoff, rcutoff2, jidxall, vdwioffset0, isai0, vdwjidx0A/B, fjx0/fjy0/fjz0, isaj0, r00, c6_00, c12_00, itab_tmp, dummy_mask, cutoff_mask, one, two, vfconv, gbconv and ewconv are never referenced below; presumably generator-template leftovers. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Pre-set the j indices and coordinate offsets so compilers do not warn about possibly uninitialized variables */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per list entry, nri entries in total */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; used to index both shiftvec and fshift (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: jjnr[j_index_start] up to jjnr[j_index_end-1] belong to this entry */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index (DIM reals per particle in x and f) */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop: two j neighbors (A and B) per iteration; an odd leftover neighbor is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: felec = qq*(rinv*rinvsq - 2*krf); assumes msub(a,b,c)=a*b-c - TODO confirm */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force: accumulate fscal*d on the i particle; the helper call below handles the two j particles' forces */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Leftover single j neighbor (odd neighbor count): load j atom coordinates through one pointer only */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: a single charge is loaded on top of a zeroed register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: same expression as in the unrolled loop above */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force; fscal was combined with a zero register just above, presumably to blank the unused element - TODO confirm */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 7 per outer iteration plus 30 per j neighbor, reported under eNR_NBKERNEL_ELEC_F */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*30);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..e57c2e1
--- /dev/null
@@ -0,0 +1,792 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i entry of the neighbor list (three charged sites, suffixes 0-2)
+ * the kernel walks the entry's j particles two at a time, evaluates the
+ * reaction-field expression for each of the three i-j pairs, and hands the
+ * accumulated i forces, j forces and summed electrostatic energy to the
+ * gmx_fjsp_* update/decrement helpers.
+ *
+ * Arguments as used below:
+ *   nlist       - supplies nri, iinr, jindex, jjnr, shift and gid
+ *   xx          - coordinates, read through the flat pointer x = xx[0]
+ *   ff          - forces, updated through the flat pointer f = ff[0]
+ *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf and ic->c_rf
+ *   mdatoms     - supplies chargeA (per-atom charges)
+ *   kernel_data - energygrp_elec, offset by gid[iidx], receives the energy
+ *   nrnb        - flop counter, incremented once at the end via inc_nrnb()
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Several locals declared below (e.g. rcutoff, c6_00, itab_tmp, one, two) are never
+     * referenced in this kernel; they appear to come from the shared generator template.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters.
+     * The three i-site charges are read once, from the first i entry
+     * (iinr[0]), pre-multiplied by facel (fr->epsfac) and then reused
+     * unchanged for every i entry in the outer loop.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings: give the j indices and offsets a
+     * defined value before the loops that normally assign them. */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list: shiftidx[iidx] counts rvecs, so it
+         * is scaled by DIM to index the flat real arrays shiftvec/fshift. */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: entries
+         * jindex[iidx] .. jindex[iidx+1]-1 of jjnr belong to this i entry. */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration.
+         * The bound j_index_end-1 leaves an odd final entry to the
+         * single-particle block that follows the loop. */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i sites */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Assuming madd(a,b,c) = a*b+c and msub(a,b,c) = a*b-c (TODO confirm
+             * against the intrinsic reference), the two lines compute
+             *   velec = qq*(rinv + krf*rsq - crf)
+             *   felec = qq*(rinv*rinvsq - krf2)
+             * The same pattern is repeated for pairs 10 and 20 below.
+             */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force: the i-site accumulator and the shared j
+             * accumulator both receive displacement*fscal; fj is handed to the
+             * decrement helper once all three pairs are done. */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates.
+             * Remainder case: exactly one j entry (A) is left over, so the
+             * single-pointer load/decrement helpers are used and there is no
+             * B particle. */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles (only the charge of particle A) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * NOTE(review): the unpacklo with a zero vector presumably keeps the
+             * low lane and clears the high one, so the empty second SIMD slot
+             * adds nothing to the sums - confirm against the intrinsic docs.
+             * The same masking is applied to fscal below. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop.
+         * The nine i-force accumulators are passed, together with the force
+         * and shift-force locations for this i entry, to the update helper. */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum goes to the energygrp_elec slot
+         * selected by this entry's group id. */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 19 per i entry plus 108 per j entry, booked
+     * under eNR_NBKERNEL_ELEC_W3_VF. */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every i entry of the neighbor list (three charged
+ * sites, suffixes 0-2) it walks the entry's j particles two at a time,
+ * evaluates the reaction-field force factor for each of the three i-j pairs
+ * and hands the accumulated i and j forces to the gmx_fjsp_* update/decrement
+ * helpers. No energy is computed or stored.
+ *
+ * Arguments as used below:
+ *   nlist       - supplies nri, iinr, jindex, jjnr, shift (gid is loaded but unused)
+ *   xx          - coordinates, read through the flat pointer x = xx[0]
+ *   ff          - forces, updated through the flat pointer f = ff[0]
+ *   fr          - supplies shift_vec, fshift, epsfac and ic->k_rf
+ *                 (ic->c_rf is loaded into crf but not used here)
+ *   mdatoms     - supplies chargeA (per-atom charges)
+ *   kernel_data - not referenced in this kernel
+ *   nrnb        - flop counter, incremented once at the end via inc_nrnb()
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Several locals declared below (e.g. velec, velecsum, rcutoff, c6_00, itab_tmp, one, two)
+     * are never referenced in this kernel; they appear to come from the shared generator template.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters.
+     * The three i-site charges are read once, from the first i entry
+     * (iinr[0]), pre-multiplied by facel (fr->epsfac) and then reused
+     * unchanged for every i entry in the outer loop.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings: give the j indices and offsets a
+     * defined value before the loops that normally assign them. */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list: shiftidx[iidx] counts rvecs, so it
+         * is scaled by DIM to index the flat real arrays shiftvec/fshift. */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: entries
+         * jindex[iidx] .. jindex[iidx+1]-1 of jjnr belong to this i entry. */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration.
+         * The bound j_index_end-1 leaves an odd final entry to the
+         * single-particle block that follows the loop. */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i sites */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Assuming msub(a,b,c) = a*b-c (TODO confirm against the intrinsic
+             * reference), this computes
+             *   felec = qq*(rinv*rinvsq - krf2)
+             * The same pattern is repeated for pairs 10 and 20 below.
+             */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force: the i-site accumulator and the shared j
+             * accumulator both receive displacement*fscal; fj is handed to the
+             * decrement helper once all three pairs are done. */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates.
+             * Remainder case: exactly one j entry (A) is left over, so the
+             * single-pointer load/decrement helpers are used and there is no
+             * B particle.
+             * NOTE(review): each fscal below is passed through unpacklo with a
+             * zero vector, presumably to keep the low lane and clear the high
+             * one so the empty second SIMD slot adds no force - confirm against
+             * the intrinsic docs. */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles (only the charge of particle A) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        /* End of innermost loop.
+         * The nine i-force accumulators are passed, together with the force
+         * and shift-force locations for this i entry, to the update helper. */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 18 per i entry plus 93 per j entry, booked
+     * under eNR_NBKERNEL_ELEC_W3_F. */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*93);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7b31dd4
--- /dev/null
@@ -0,0 +1,1468 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 315 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 315 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*315);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*270);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..89c9e8c
--- /dev/null
@@ -0,0 +1,792 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle (i sites 1-3 of each i entry against single j particles)
+ * Calculate force/pot:        PotentialAndForce (potential is summed into kernel_data->energygrp_elec)
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates, accessed below through the flat real pointer x */
+                     rvec * gmx_restrict                    ff,           /* forces, accessed below through the flat real pointer f */
+                     t_forcerec * gmx_restrict              fr,           /* supplies shift_vec, fshift, epsfac, ic->k_rf and ic->c_rf */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* supplies chargeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* energygrp_elec receives the summed potential */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop counters, updated through inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters. This kernel only touches i sites 1,2,3 and j site 0.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar; /* not referenced in this function body */
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* only fscal is referenced in this function body */
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* not referenced in this function body */
+
+    x                = xx[0]; /* flat real view; indexed with DIM*atom offsets below */
+    f                = ff[0]; /* flat real view; indexed with DIM*atom offsets below */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac); /* folded into the i charges below */
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters: charges of sites 1-3 are read once from the first i entry, scaled by facel, and reused for every i entry */
+    inr              = nlist->iinr[0]; /* NOTE(review): read before the nri loop; presumably callers never pass an empty list - confirm */
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (the +DIM offset starts at site 1, skipping site 0) */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A,B) per iteration; an odd trailing entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j forces are accumulated locally and handed to the decrement helper once per iteration */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: velec = qq*(krf*rsq + rinv - crf), felec = qq*(rinv*rinvsq - krf2); assumes madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c - TODO confirm */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expressions as for pair 10) */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expressions as for pair 10) */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* presumably subtracts the accumulated fj from f at both j atoms - confirm in kernelutil header */
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end) /* true only when the j list has an odd number of entries: one entry is left over */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles (single charge loaded onto a zeroed register; presumably fills the low lane only - confirm) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expressions as in the unrolled loop) */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. The unpacklo with zero presumably clears the lane that has no real j particle - confirm. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same lane masking, applied to the force scalar */
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expressions as in the unrolled loop) */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expressions as in the unrolled loop) */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop; the i forces for sites 1-3 (hence f+i_coord_offset+DIM) and the shift-force slot are passed to the update helper */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx]; /* index into energygrp_elec for this i entry */
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counts j entries, not SIMD iterations) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 19 per i entry plus 108 per j entry, matching the per-loop counts noted above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*93);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..b6b4e44
--- /dev/null
@@ -0,0 +1,1468 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are placed in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 315 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 315 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*315);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
+}
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..5e07e4b
--- /dev/null
@@ -0,0 +1,481 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifndef nb_kernel_sparc64_hpc_ace_double_h
+#define nb_kernel_sparc64_hpc_ace_double_h
+
+#include "../nb_kernel.h"
+/*
+ * Prototypes for the nonbonded kernel functions of this flavour.  The
+ * kernellist_sparc64_hpc_ace_double[] table that follows refers to them; its
+ * visible rows pair one of these identifiers with its name string and a set
+ * of descriptive attribute strings.
+ *
+ * nb_kernel_t is not defined in this file - presumably it comes from
+ * ../nb_kernel.h (the only header included here); TODO confirm.
+ *
+ * Names follow nb_kernel_Elec*_Vdw*_Geom*_{VF,F}_sparc64_hpc_ace_double, and
+ * every Elec/Vdw/Geom combination is declared twice: a _VF and a _F variant.
+ * Going by the table rows that spell the tags out:
+ *   ElecNone -> "None", ElecEw -> "Ewald"
+ *   VdwLJ    -> "LennardJones"; VdwLJSh / VdwLJSw rows additionally carry
+ *               "PotentialShift" / "PotentialSwitch"
+ *   VdwCSTab -> "CubicSplineTable"
+ *   GeomP1P1 -> "ParticleParticle", GeomW3P1 -> "Water3Particle"
+ *   _VF      -> "PotentialAndForce", _F -> "Force"
+ * NOTE(review): the other tags used here (EwSh, EwSw, Coul, CSTab as an Elec
+ * tag, GB, RFCut, RF, W3W3, W4P1, W4W4) should be read off the matching table
+ * rows rather than guessed from the abbreviation.
+ */
+nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+
+
+nb_kernel_info_t
+kernellist_sparc64_hpc_ace_double[] =
+{
+    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
+};
+
+int /* Number of entries in kernellist_sparc64_hpc_ace_double */
+kernellist_sparc64_hpc_ace_double_size = (int)(sizeof kernellist_sparc64_hpc_ace_double / sizeof *kernellist_sparc64_hpc_ace_double);
+
+#endif
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h
new file mode 100644 (file)
index 0000000..afb925b
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * Note: this file was generated by the Gromacs c kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifndef nb_kernel_sparc64_hpc_ace_double_h
+#define nb_kernel_sparc64_hpc_ace_double_h
+
+#include "../nb_kernel.h"
+
+
+/* Table of all nonbonded kernels for this architecture, with metadata describing each one */
+extern nb_kernel_info_t
+kernellist_sparc64_hpc_ace_double[];
+
+/* Number of entries in kernellist_sparc64_hpc_ace_double */
+extern int
+kernellist_sparc64_hpc_ace_double_size;
+
+#endif
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_template_sparc64_hpc_ace_double.pre b/src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_template_sparc64_hpc_ace_double.pre
new file mode 100644 (file)
index 0000000..1349445
--- /dev/null
@@ -0,0 +1,1086 @@
+/* ## */
+/* ## This file is part of the GROMACS molecular simulation package. */
+/* ## */
+/* ## Copyright (c) 2012, by the GROMACS development team, led by */
+/* ## David van der Spoel, Berk Hess, Erik Lindahl, and including many */
+/* ## others, as listed in the AUTHORS file in the top-level source */
+/* ## directory and at http://www.gromacs.org. */
+/* ## */
+/* ## GROMACS is free software; you can redistribute it and/or */
+/* ## modify it under the terms of the GNU Lesser General Public License */
+/* ## as published by the Free Software Foundation; either version 2.1 */
+/* ## of the License, or (at your option) any later version. */
+/* ## */
+/* ## GROMACS is distributed in the hope that it will be useful, */
+/* ## but WITHOUT ANY WARRANTY; without even the implied warranty of */
+/* ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU */
+/* ## Lesser General Public License for more details. */
+/* ## */
+/* ## You should have received a copy of the GNU Lesser General Public */
+/* ## License along with GROMACS; if not, see */
+/* ## http://www.gnu.org/licenses, or write to the Free Software Foundation, */
+/* ## Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA. */
+/* ## */
+/* ## If you want to redistribute modifications to GROMACS, please */
+/* ## consider that scientific software is very special. Version */
+/* ## control is crucial - bugs must be traceable. We will be happy to */
+/* ## consider code for inclusion in the official distribution, but */
+/* ## derived work must not be called official GROMACS. Details are found */
+/* ## in the README & COPYING files - if they are missing, get the */
+/* ## official version at http://www.gromacs.org. */
+/* ## */
+/* ## To help us fund GROMACS development, we humbly ask that you cite */
+/* ## the research papers on the package. Check out http://www.gromacs.org. */
+/* ## */
+/* #if 0 */
+#error This file must be processed with the Gromacs pre-preprocessor
+/* #endif */
+/* #if INCLUDE_HEADER */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+/* #endif */
+
+/* ## List of variables set by the generating script:                                    */
+/* ##                                                                                    */
+/* ## Settings that apply to the entire kernel:                                          */
+/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
+/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
+/* ## KERNEL_NAME:           String, name of this kernel                                 */
+/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
+/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
+/* ##                                                                                    */
+/* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
+/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
+/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
+/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particle that have electrostatics  */
+/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
+/* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particle that have VdW          */
+/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
+/* ##                                                                                    */
+/* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
+/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
+/* ##                        should be calculated in this kernel. Zero-charge particles  */
+/* ##                        do not have interactions with particles without vdw, and    */
+/* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
+/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
+/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
+/* ##                        defining properties/flags of this interaction. Examples     */
+/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
+/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
+/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
+/* ##                        decide if the force/potential should be modified. This way  */
+/* ##                        we only calculate values absolutely needed for each case.   */
+
+/* ## Calculate the size and offset for (merged/interleaved) table data */
+
+/*
+ * Gromacs nonbonded kernel:   {KERNEL_NAME}
+ * Electrostatics interaction: {KERNEL_ELEC}
+ * VdW interaction:            {KERNEL_VDW}
+ * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
+ * Calculate force/pot:        {KERNEL_VF}
+ */
+void
+{KERNEL_NAME}
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
+    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    /* #for I in PARTICLES_I */
+    int              vdwioffset{I};
+    _fjsp_v2r8       ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
+    /* #endfor */
+    /* #for J in PARTICLES_J */
+    int              vdwjidx{J}A,vdwjidx{J}B;
+    _fjsp_v2r8       jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
+    /* #endfor */
+    /* #for I,J in PAIRS_IJ */
+    _fjsp_v2r8       dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
+    /* #endfor */
+    /* #if KERNEL_ELEC != 'None' */
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    /* #endif */
+    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    /* #endif */
+    /* #if KERNEL_VDW != 'None' */
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    /* #endif */
+    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    /* #endif */
+    /* #if 'Ewald' in KERNEL_ELEC */
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    /* #endif */
+    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    /* #endif */
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    /* #if KERNEL_ELEC != 'None' */
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    /*     #if 'ReactionField' in KERNEL_ELEC */
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    /*     #endif */
+    /* #endif */
+    /* #if KERNEL_VDW != 'None' */
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    /* #endif */
+
+    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+    /* #elif 'Table' in KERNEL_ELEC */
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+    /* #elif 'Table' in KERNEL_VDW */
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+    /* #endif */
+
+    /* #if 'Ewald' in KERNEL_ELEC */
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+    /*     #else */
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /*     #endif */
+    /* #endif */
+
+    /* #if KERNEL_ELEC=='GeneralizedBorn' */
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+    /* #endif */
+
+    /* #if 'Water' in GEOMETRY_I */
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    /*     #for I in PARTICLES_ELEC_I */
+    iq{I}              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+{I}]));
+    /*     #endfor */
+    /*     #for I in PARTICLES_VDW_I */
+    vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
+    /*     #endfor */
+    /* #endif */
+
+    /* #if 'Water' in GEOMETRY_J */
+    /*     #for J in PARTICLES_ELEC_J */
+    jq{J}              = gmx_fjsp_set1_v2r8(charge[inr+{J}]);
+    /*     #endfor */
+    /*     #for J in PARTICLES_VDW_J */
+    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
+    /*     #endfor */
+    /*     #for I,J in PAIRS_IJ */
+    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+    qq{I}{J}             = _fjsp_mul_v2r8(iq{I},jq{J});
+    /*         #endif */
+    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+    c6_{I}{J}            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+    c12_{I}{J}           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    /*         #endif */
+    /*     #endfor */
+    /* #endif */
+
+    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
+    /*     #if KERNEL_ELEC!='None' */
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    /*     #else */
+    rcutoff_scalar   = fr->rvdw;
+    /*     #endif */
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+    /* #endif */
+
+    /* #if KERNEL_MOD_VDW=='PotentialShift' */
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+    /* #endif */
+
+    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
+    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /*     #else */
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /*     #endif */
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    /*     #if 'Force' in KERNEL_VF */
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    /*     #endif */
+    /* #endif */
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    /* ## Keep track of the floating point operations we issue for reporting! */
+    /* #define OUTERFLOPS 0 */
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        /* #if GEOMETRY_I == 'Particle' */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        /* #elif GEOMETRY_I == 'Water3' */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        /* #elif GEOMETRY_I == 'Water4' */
+        /*     #if 0 in PARTICLES_I                 */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        /*     #else                                */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        /*     #endif                               */
+        /* #endif                                   */
+
+        /* #if 'Force' in KERNEL_VF */
+        /*     #for I in PARTICLES_I */
+        fix{I}             = _fjsp_setzero_v2r8();
+        fiy{I}             = _fjsp_setzero_v2r8();
+        fiz{I}             = _fjsp_setzero_v2r8();
+        /*     #endfor */
+        /* #endif */
+
+        /* ## For water we already preloaded parameters at the start of the kernel */
+        /* #if not 'Water' in GEOMETRY_I */
+        /* Load parameters for i particles */
+        /*     #for I in PARTICLES_ELEC_I */
+        iq{I}              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+{I}));
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
+        isai{I}            = gmx_fjsp_load1_v2r8(invsqrta+inr+{I});
+        /*         #endif */
+        /*     #endfor */
+        /*     #for I in PARTICLES_VDW_I */
+        vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
+        /*     #endfor */
+        /* #endif */
+
+        /* #if 'Potential' in KERNEL_VF */
+        /* Reset potential sums */
+        /*     #if KERNEL_ELEC != 'None' */
+        velecsum         = _fjsp_setzero_v2r8();
+        /*     #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
+        vgbsum           = _fjsp_setzero_v2r8();
+        /*     #endif */
+        /*     #if KERNEL_VDW != 'None' */
+        vvdwsum          = _fjsp_setzero_v2r8();
+        /*     #endif */
+        /* #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
+        dvdasum          = _fjsp_setzero_v2r8();
+        /*     #endif */
+
+        /* #for ROUND in ['Loop','Epilogue'] */
+
+        /* #if ROUND =='Loop' */
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+        /* ## First round is normal loop (next statement resets indentation) */
+        /*     #if 0 */
+        }
+        /*     #endif */
+        /* #else */
+        if(jidx<j_index_end)
+        {
+        /* ## Second round is epilogue */
+        /* #endif */
+        /* #define INNERFLOPS 0 */
+
+            /* #if ROUND =='Loop' */
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            /*     #if GEOMETRY_J == 'Particle'             */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+            /*     #elif GEOMETRY_J == 'Water3'             */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+            /*     #elif GEOMETRY_J == 'Water4'             */
+            /*         #if 0 in PARTICLES_J                 */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+            /*         #else                                */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+            /*         #endif                               */
+            /*     #endif                                   */
+            /* #else */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            /*     #if GEOMETRY_J == 'Particle'             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+            /*     #elif GEOMETRY_J == 'Water3'             */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+            /*     #elif GEOMETRY_J == 'Water4'             */
+            /*         #if 0 in PARTICLES_J                 */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+            /*         #else                                */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+            /*         #endif                               */
+            /*     #endif                                   */
+            /* #endif */
+
+            /* Calculate displacement vector */
+            /* #for I,J in PAIRS_IJ */
+            dx{I}{J}             = _fjsp_sub_v2r8(ix{I},jx{J});
+            dy{I}{J}             = _fjsp_sub_v2r8(iy{I},jy{J});
+            dz{I}{J}             = _fjsp_sub_v2r8(iz{I},jz{J});
+            /*     #define INNERFLOPS INNERFLOPS+3 */
+            /* #endfor */
+
+            /* Calculate squared distance and things based on it */
+            /* #for I,J in PAIRS_IJ */
+            rsq{I}{J}            = gmx_fjsp_calc_rsq_v2r8(dx{I}{J},dy{I}{J},dz{I}{J});
+            /*     #define INNERFLOPS INNERFLOPS+5 */
+            /* #endfor */
+
+            /* #for I,J in PAIRS_IJ */
+            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
+            rinv{I}{J}           = gmx_fjsp_invsqrt_v2r8(rsq{I}{J});
+            /*         #define INNERFLOPS INNERFLOPS+5 */
+            /*     #endif */
+            /* #endfor */
+
+            /* #for I,J in PAIRS_IJ */
+            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
+            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
+            rinvsq{I}{J}         = gmx_fjsp_inv_v2r8(rsq{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*         #else */
+            rinvsq{I}{J}         = _fjsp_mul_v2r8(rinv{I}{J},rinv{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #endif */
+            /*     #endif */
+            /* #endfor */
+
+            /* #if not 'Water' in GEOMETRY_J */
+            /* Load parameters for j particles */
+            /*     #for J in PARTICLES_ELEC_J */
+            /*         #if ROUND =='Loop' */
+            jq{J}              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+{J},charge+jnrB+{J});
+            /*         #else */
+            jq{J}              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+{J});
+            /*         #endif */
+            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
+            /*             #if ROUND =='Loop' */
+            isaj{J}            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+{J},invsqrta+jnrB+{J});
+            /*             #else */
+            isaj{J}            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+{J});
+            /*             #endif */
+            /*         #endif */
+            /*     #endfor */
+            /*     #for J in PARTICLES_VDW_J */
+            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
+            /*         #if ROUND =='Loop' */
+            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
+            /*         #endif */
+            /*     #endfor */
+            /* #endif */
+
+            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
+            /*     #for J in PARTICLES_J */
+            fjx{J}             = _fjsp_setzero_v2r8();
+            fjy{J}             = _fjsp_setzero_v2r8();
+            fjz{J}             = _fjsp_setzero_v2r8();
+            /*     #endfor */
+            /* #endif */
+
+            /* #for I,J in PAIRS_IJ */
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
+            if (gmx_fjsp_any_lt_v2r8(rsq{I}{J},rcutoff2))
+            {
+                /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in the template file */
+            }
+            /*         #endif */
+            /*         #define INNERFLOPS INNERFLOPS+1 */
+            /*     #endif */
+
+            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
+            r{I}{J}              = _fjsp_mul_v2r8(rsq{I}{J},rinv{I}{J});
+             /*         #define INNERFLOPS INNERFLOPS+1 */
+            /*     #endif */
+
+            /*     ## For water geometries we already loaded parameters at the start of the kernel */
+            /*     #if not 'Water' in GEOMETRY_J */
+            /* Compute parameters for interactions between i and j atoms */
+            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+            qq{I}{J}             = _fjsp_mul_v2r8(iq{I},jq{J});
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #endif */
+            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+            /*             #if ROUND == 'Loop' */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset{I}+vdwjidx{J}A,
+                                         vdwparam+vdwioffset{I}+vdwjidx{J}B,&c6_{I}{J},&c12_{I}{J});
+            /*             #else */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset{I}+vdwjidx{J}A,&c6_{I}{J},&c12_{I}{J});
+            /*             #endif */
+            /*         #endif */
+            /*     #endif */
+
+            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r{I}{J},vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            /*         #define INNERFLOPS INNERFLOPS+4                          */
+            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
+            /*             ## 3 tables, 4 data per point: multiply index by 12 */
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+            /*         #elif 'Table' in KERNEL_ELEC                             */
+            /*             ## 1 table, 4 data per point: multiply index by 4   */
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+            /*         #elif 'Table' in KERNEL_VDW                              */
+            /*             ## 2 tables, 4 data per point: multiply index by 8  */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+            /*         #endif                                                   */
+            /*     #endif */
+
+            /*     ## ELECTROSTATIC INTERACTIONS */
+            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+
+            /*         #if KERNEL_ELEC=='Coulomb' */
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq{I}{J},rinv{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*             #if 'Force' in KERNEL_VF */
+            felec            = _fjsp_mul_v2r8(velec,rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+2 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='ReactionField' */
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            /*             #if 'Potential' in KERNEL_VF */
+            velec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq{I}{J},rinv{I}{J}),crf));
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            felec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_msub_v2r8(rinv{I}{J},rinvsq{I}{J},krf2));
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai{I},isaj{J});
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq{I}{J},_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+            /*             #define INNERFLOPS INNERFLOPS+5 */
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r{I}{J},gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            /*             #if ROUND == 'Loop' */
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            /*             #else */
+            F                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            /*             #if ROUND == 'Loop' */
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            /*             #else */
+            H                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+            /*             #define INNERFLOPS INNERFLOPS+10 */
+
+            /*             #if 'Force' in KERNEL_VF */
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r{I}{J},vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            /*             #if ROUND == 'Loop' */
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj{J},isaj{J})));
+            /*             #else */
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj{J},isaj{J})));
+            /*             #endif */
+            /*                 #define INNERFLOPS INNERFLOPS+13 */
+            /*             #endif */
+            velec            = _fjsp_mul_v2r8(qq{I}{J},rinv{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #if 'Force' in KERNEL_VF */
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv{I}{J},fgb),rinv{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='Ewald' */
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r{I}{J},ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            /*                 #if ROUND == 'Loop' */
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            /*                 #else */
+            ewtabD           = _fjsp_setzero_v2r8();
+            /*                 #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            /*                 #if ROUND == 'Loop' */
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            /*                 #else */
+            ewtabFn          = _fjsp_setzero_v2r8();
+            /*                 #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            /*                 #define INNERFLOPS INNERFLOPS+2 */
+            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */            
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv{I}{J},sh_ewald),velec));
+            /*                     #define INNERFLOPS INNERFLOPS+7 */
+            /*                 #else */
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_sub_v2r8(rinv{I}{J},velec));
+            /*                     #define INNERFLOPS INNERFLOPS+6 */
+            /*                 #endif */
+            /*                 #if 'Force' in KERNEL_VF */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq{I}{J},rinv{I}{J}),_fjsp_sub_v2r8(rinvsq{I}{J},felec));
+            /*                      #define INNERFLOPS INNERFLOPS+3 */
+            /*                 #endif */
+            /*             #elif KERNEL_VF=='Force' */
+            /*                 #if ROUND == 'Loop' */
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            /*                 #else */
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            /*                 #endif */
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq{I}{J},rinv{I}{J}),_fjsp_sub_v2r8(rinvsq{I}{J},felec));
+            /*                 #define INNERFLOPS INNERFLOPS+7 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            /*             #if ROUND == 'Loop' */
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            /*             #else */
+            F                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            /*             #if ROUND == 'Loop' */
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            /*             #else */
+            H                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF */
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq{I}{J},VV);
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq{I}{J},FF),_fjsp_mul_v2r8(vftabscale,rinv{I}{J})));
+            /*                 #define INNERFLOPS INNERFLOPS+7 */
+            /*             #endif */
+            /*         #endif */
+            /*         ## End of check for electrostatics interaction forms */
+            /*     #endif */
+            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
+
+            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
+
+            /*         #if KERNEL_VDW=='LennardJones' */
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+2 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+            vvdw6            = _fjsp_mul_v2r8(c6_{I}{J},rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_{I}{J},_fjsp_mul_v2r8(rinvsix,rinvsix));
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_{I}{J},_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_{I}{J},sh_vdw_invrcut6,vvdw6),one_sixth));
+            /*                     #define INNERFLOPS INNERFLOPS+8 */
+            /*                 #else */
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            /*                     #define INNERFLOPS INNERFLOPS+3 */
+            /*                 #endif */
+            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                 #if 'Force' in KERNEL_VF */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq{I}{J});
+            /*                     #define INNERFLOPS INNERFLOPS+2 */
+            /*                 #endif */
+            /*             #elif KERNEL_VF=='Force' */
+            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_{I}{J},rinvsix,c6_{I}{J}),_fjsp_mul_v2r8(rinvsix,rinvsq{I}{J}));
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+
+            /*         #elif KERNEL_VDW=='CubicSplineTable' */
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            /*             #if 'Table' in KERNEL_ELEC */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            /*             #endif                     */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            /*             #if ROUND == 'Loop' */
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            /*             #else */
+            F                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            /*             #if ROUND == 'Loop' */
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            /*             #else */
+            H                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF */
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_{I}{J},VV);
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_{I}{J},FF);
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            /*             #if ROUND == 'Loop' */
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            /*             #else */
+            F                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            /*             #if ROUND == 'Loop' */
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            /*             #else */
+            H                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF */
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_{I}{J},VV);
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_{I}{J},FF);
+            /*                 #define INNERFLOPS INNERFLOPS+5 */
+            /*             #endif */
+            /*             #if 'Potential' in KERNEL_VF */
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv{I}{J})));
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*         #endif */
+            /*         ## End of check for vdw interaction forms */
+            /*     #endif */
+            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
+
+            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
+            d                = _fjsp_sub_v2r8(r{I}{J},rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+            /*         #define INNERFLOPS INNERFLOPS+10 */
+
+            /*         #if 'Force' in KERNEL_VF */
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+            /*             #define INNERFLOPS INNERFLOPS+5 */
+            /*         #endif */
+
+            /* Evaluate switch function */
+            /*         #if 'Force' in KERNEL_VF */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv{I}{J},_fjsp_mul_v2r8(velec,dsw)) );
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv{I}{J},_fjsp_mul_v2r8(vvdw,dsw)) );
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*         #endif */
+            /*         #if 'Potential' in KERNEL_VF */
+            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*         #endif */
+            /*     #endif */
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq{I}{J},rcutoff2);
+            /*         #define INNERFLOPS INNERFLOPS+1 */
+            /*     #endif */
+
+            /*     #if 'Potential' in KERNEL_VF */
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+            /*             #if ROUND == 'Epilogue' */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            /*             #endif */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            vgb              = _fjsp_and_v2r8(vgb,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+            /*             #if ROUND == 'Epilogue' */
+            vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+            /*             #endif */
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*         #endif */
+            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+            /*             #if ROUND == 'Epilogue' */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            /*             #endif */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #endif */
+            /*     #endif */
+
+            /*     #if 'Force' in KERNEL_VF */
+
+            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
+            fscal            = felec;
+            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
+            fscal            = fvdw;
+            /*        #endif */
+
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+
+            /*             #if ROUND == 'Epilogue' */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+            /*             #endif */
+
+            /* ## Construction of vectorial force built into FMA instructions now */
+            /* #define INNERFLOPS INNERFLOPS+3      */
+            
+            /* Update vectorial force */
+            fix{I}             = _fjsp_madd_v2r8(dx{I}{J},fscal,fix{I});
+            fiy{I}             = _fjsp_madd_v2r8(dy{I}{J},fscal,fiy{I});
+            fiz{I}             = _fjsp_madd_v2r8(dz{I}{J},fscal,fiz{I});
+            /*             #define INNERFLOPS INNERFLOPS+6 */
+            
+            /* #if GEOMETRY_I == 'Particle'             */
+            /*     #if ROUND == 'Loop' */
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx{I}{J},dy{I}{J},dz{I}{J});
+            /*     #else */
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx{I}{J},dy{I}{J},dz{I}{J});
+            /*     #endif */
+            /*     #define INNERFLOPS INNERFLOPS+3      */
+            /* #else                                    */
+            fjx{J}             = _fjsp_madd_v2r8(dx{I}{J},fscal,fjx{J});
+            fjy{J}             = _fjsp_madd_v2r8(dy{I}{J},fscal,fjy{J});
+            fjz{J}             = _fjsp_madd_v2r8(dz{I}{J},fscal,fjz{J});
+            /*     #define INNERFLOPS INNERFLOPS+3      */
+            /* #endif                                   */
+
+            /*     #endif */
+
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            /*         #if 0    ## This and next two lines is a hack to maintain indentation in template file */
+            {
+                /*     #endif */
+            }
+            /*     #endif */
+            /*    ## End of check for the interaction being outside the cutoff */
+
+            /* #endfor */
+            /* ## End of loop over i-j interaction pairs */
+
+            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
+            /*     #if ROUND == 'Loop' */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+            /*     #else */
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+            /*     #endif */
+            /*     #define INNERFLOPS INNERFLOPS+3      */
+            /* #elif GEOMETRY_J == 'Water3'             */
+            /*     #if ROUND == 'Loop' */
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+            /*     #else */
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+            /*     #endif */
+            /*     #define INNERFLOPS INNERFLOPS+9      */
+            /* #elif GEOMETRY_J == 'Water4'             */
+            /*     #if 0 in PARTICLES_J                 */
+            /*         #if ROUND == 'Loop' */
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+            /*         #else */
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+            /*         #endif */
+            /*         #define INNERFLOPS INNERFLOPS+12 */
+            /*     #else                                */
+            /*         #if ROUND == 'Loop' */
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+            /*         #else */
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+            /*         #endif */
+            /*         #define INNERFLOPS INNERFLOPS+9  */
+            /*     #endif                               */
+            /* #endif                                   */
+
+            /* Inner loop uses {INNERFLOPS} flops */
+        }
+
+        /* #endfor */
+
+        /* End of innermost loop */
+
+        /* #if 'Force' in KERNEL_VF */
+        /*     #if GEOMETRY_I == 'Particle'            */
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+        /*         #define OUTERFLOPS OUTERFLOPS+6     */
+        /*     #elif GEOMETRY_I == 'Water3'            */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+        /*         #define OUTERFLOPS OUTERFLOPS+18    */
+        /*     #elif GEOMETRY_I == 'Water4'            */
+        /*         #if 0 in PARTICLES_I                */
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+        /*             #define OUTERFLOPS OUTERFLOPS+24    */
+        /*         #else                               */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+        /*             #define OUTERFLOPS OUTERFLOPS+18    */
+        /*         #endif                              */
+        /*     #endif                                  */
+        /* #endif                                      */
+
+        /* #if 'Potential' in KERNEL_VF */
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        /*     #if KERNEL_ELEC != 'None' */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*     #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
+        gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*     #endif */
+        /*     #if KERNEL_VDW != 'None' */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*     #endif */
+        /* #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai{I},isai{I}));
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+        /*     #endif */
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses {OUTERFLOPS} flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
+    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
+    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
+    /* #if GEOMETRY_I == 'Water3'            */
+    /*     #define ISUFFIX '_W3'             */
+    /* #elif GEOMETRY_I == 'Water4'          */
+    /*     #define ISUFFIX '_W4'             */
+    /* #else                                 */
+    /*     #define ISUFFIX ''                */
+    /* #endif                                */
+    /* #if GEOMETRY_J == 'Water3'            */
+    /*     #define JSUFFIX 'W3'              */
+    /* #elif GEOMETRY_J == 'Water4'          */
+    /*     #define JSUFFIX 'W4'              */
+    /* #else                                 */
+    /*     #define JSUFFIX ''                */
+    /* #endif                                */
+    /* #if 'PotentialAndForce' in KERNEL_VF  */
+    /*     #define VFSUFFIX  '_VF'           */
+    /* #elif 'Potential' in KERNEL_VF        */
+    /*     #define VFSUFFIX '_V'             */
+    /* #else                                 */
+    /*     #define VFSUFFIX '_F'             */
+    /* #endif                                */
+
+    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+    /* #elif KERNEL_ELEC != 'None' */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+    /* #else */
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+    /* #endif  */
+}
index afe5f56351bd72b9186b25cd67aeeb125dcc3599..ac5fe893d4bce8455a49506e2f122680893b7e1f 100644 (file)
@@ -96,6 +96,9 @@
 #if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
 #    include "nb_kernel_avx_256_double/nb_kernel_avx_256_double.h"
 #endif
+#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
+#    include "nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h"
+#endif
 
 
 #ifdef GMX_THREAD_MPI
@@ -148,6 +151,9 @@ gmx_nonbonded_setup(FILE *         fplog,
 #endif
 #if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
                 nb_kernel_list_add_kernels(kernellist_avx_256_double, kernellist_avx_256_double_size);
+#endif
+#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
+                nb_kernel_list_add_kernels(kernellist_sparc64_hpc_ace_double,kernellist_sparc64_hpc_ace_double_size);
 #endif
                 ; /* empty statement to avoid a completely empty block */
             }
@@ -213,6 +219,10 @@ gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
 #if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE)
         /* No padding - see comment above */
         { "sse4_1_double", 1 },
+#endif
+#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
+        /* No padding - see comment above */
+        { "sparc64_hpc_ace_double", 1 },
 #endif
         { "c", 1 },
     };
index 06b389d5cedcf9731ae7bbf0c8091bb04fab9fbf..6aa852f00e2667c81970a6a09dd36c426ba91420 100644 (file)
@@ -10,7 +10,7 @@ set(THREAD_MPI_LIB_SOURCE
     errhandler.c    p2p_send_recv.c type.c
     event.c         p2p_wait.c      tmpi_malloc.c
     gather.c        profile.c
-    group.c         numa_malloc.c   )
+    group.c         numa_malloc.c   atomic.c)
 
 
 if (THREAD_PTHREADS)
diff --git a/src/gromacs/gmxlib/thread_mpi/atomic.c b/src/gromacs/gmxlib/thread_mpi/atomic.c
new file mode 100644 (file)
index 0000000..b9f5e0e
--- /dev/null
@@ -0,0 +1,234 @@
+/*
+   This source code file is part of thread_mpi.
+   Written by Sander Pronk, Erik Lindahl, and possibly others.
+
+   Copyright (c) 2009, Sander Pronk, Erik Lindahl.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+   1) Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   2) Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   3) Neither the name of the copyright holders nor the
+   names of its contributors may be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
+   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
+   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   If you want to redistribute modifications, please consider that
+   scientific software is very special. Version control is crucial -
+   bugs must be traceable. We will be happy to consider code for
+   inclusion in the official distribution, but derived work should not
+   be called official thread_mpi. Details are found in the README & COPYING
+   files.
+ */
+
+#include "impl.h"
+
+/* This file is only needed when no intrinsic atomic operations are present. */
+#ifdef TMPI_NO_ATOMICS
+
+/** System mutex used for locking to guarantee atomicity.
+ *
+ * A single file-scope mutex: every emulated atomic operation below takes
+ * it around its access to the shared value. */
+static tMPI_Thread_mutex_t tMPI_Atomic_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+
+struct tMPI_Spinlock
+{
+    tMPI_Thread_mutex_t *lock; /* mutex that stands in for the spinlock */
+};
+
+/* Return the current value of *a, read while holding the global mutex. */
+int tMPI_Atomic_get(const tMPI_Atomic_t *a)
+{
+    int snapshot;
+
+    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+    snapshot = a->value;
+    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+
+    return snapshot;
+}
+
+/* Store value into *a while holding the global mutex. */
+void tMPI_Atomic_set(tMPI_Atomic_t *a, int value)
+{
+    tMPI_Thread_mutex_t *const guard = &tMPI_Atomic_mutex;
+
+    tMPI_Thread_mutex_lock(guard);
+    a->value = value;
+    tMPI_Thread_mutex_unlock(guard);
+}
+
+
+/* Return the pointer currently stored in *a, read under the global mutex. */
+void* tMPI_Atomic_ptr_get(const tMPI_Atomic_ptr_t *a)
+{
+    void *snapshot;
+
+    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+    snapshot = a->value;
+    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+
+    return snapshot;
+}
+
+/* Store the pointer value into *a while holding the global mutex. */
+void tMPI_Atomic_ptr_set(tMPI_Atomic_ptr_t *a, void *value)
+{
+    tMPI_Thread_mutex_t *const guard = &tMPI_Atomic_mutex;
+
+    tMPI_Thread_mutex_lock(guard);
+    a->value = value;
+    tMPI_Thread_mutex_unlock(guard);
+}
+
+/* Add i to *a under the global mutex and return the NEW value. */
+int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
+{
+    int updated;
+
+    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+    updated  = a->value + i;
+    a->value = updated;
+    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+
+    return updated;
+}
+
+/* Add i to *a under the global mutex and return the value *a held BEFORE
+   the addition. */
+int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
+{
+    int previous;
+
+    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+    previous = a->value;
+    a->value = previous + i;
+    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+
+    return previous;
+}
+
+/* Compare-and-swap under the global mutex: if *a equals old_val, store
+   new_val. Returns 1 when the store happened, 0 otherwise. */
+int tMPI_Atomic_cas(tMPI_Atomic_t *a, int old_val, int new_val)
+{
+    int swapped;
+
+    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+    swapped = (a->value == old_val);
+    if (swapped)
+    {
+        a->value = new_val;
+    }
+    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+
+    return swapped;
+}
+
+
+/* Pointer compare-and-swap under the global mutex: if *a holds old_val,
+   store new_val. Returns 1 when the store happened, 0 otherwise. */
+int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t * a, void *old_val, void *new_val)
+{
+    int swapped;
+
+    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+    swapped = (a->value == old_val);
+    if (swapped)
+    {
+        a->value = new_val;
+    }
+    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+
+    return swapped;
+}
+
+/* Store b into *a under the global mutex and return the value it replaced. */
+int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
+{
+    int previous;
+
+    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+    previous = a->value;
+    a->value = b;
+    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+
+    return previous;
+}
+
+/* Store the pointer b into *a under the global mutex and return the
+   pointer it replaced. */
+void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
+{
+    void *previous;
+
+    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+    previous = a->value;
+    a->value = b;
+    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+
+    return previous;
+}
+
+
+/* Allocate the spinlock object for *x together with the mutex it wraps,
+   and initialize that mutex. The whole setup runs under the global mutex.
+   NOTE(review): the malloc results are not checked; this function has no
+   way to report failure to its caller. */
+void tMPI_Spinlock_init( tMPI_Spinlock_t *x)
+{
+    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+    /* Size of the pointed-to struct: sizeof(tMPI_Spinlock_t) would be the
+       size of the pointer type, which only matches by coincidence. */
+    *x         = (tMPI_Spinlock_t)malloc(sizeof(**x));
+    (*x)->lock = (tMPI_Thread_mutex_t*)malloc(sizeof(tMPI_Thread_mutex_t));
+    tMPI_Thread_mutex_init((*x)->lock);
+    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+}
+
+/* Lazily create the spinlock behind *x if it has not been set up yet
+   (*x is still NULL); otherwise do nothing.
+   NOTE: this function takes the global atomic mutex itself, so the caller
+   must NOT already hold it.
+   NOTE(review): the malloc results are not checked; this function has no
+   way to report failure to its caller. */
+static void tMPI_Spinlock_init_once(tMPI_Spinlock_t *x)
+{
+    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+    if (!*x)
+    {
+        /* Size of the pointed-to struct, not of the pointer typedef
+           (sizeof does not evaluate its operand, so *x == NULL is fine). */
+        *x         = (tMPI_Spinlock_t)malloc(sizeof(**x));
+        (*x)->lock = (tMPI_Thread_mutex_t*)malloc(sizeof(tMPI_Thread_mutex_t));
+        tMPI_Thread_mutex_init((*x)->lock);
+    }
+    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+}
+
+
+/* Acquire the spinlock, creating it first if it does not exist yet. */
+void tMPI_Spinlock_lock( tMPI_Spinlock_t *x)
+{
+    tMPI_Spinlock_init_once(x);
+
+    tMPI_Thread_mutex_lock((**x).lock);
+}
+
+/* Release the spinlock (the lazy-creation helper is called first, as in
+   every other spinlock entry point). */
+void tMPI_Spinlock_unlock( tMPI_Spinlock_t *x)
+{
+    tMPI_Spinlock_init_once(x);
+
+    tMPI_Thread_mutex_unlock((**x).lock);
+}
+
+/* Try to acquire the spinlock without blocking; the result of the
+   underlying tMPI_Thread_mutex_trylock call is returned unchanged. */
+int tMPI_Spinlock_trylock( tMPI_Spinlock_t *x)
+{
+    tMPI_Spinlock_init_once(x);
+
+    return tMPI_Thread_mutex_trylock((*x)->lock);
+}
+
+/* Report whether the spinlock is currently held: returns 1 if a trylock
+   on the underlying mutex fails, 0 if it succeeds (in which case the mutex
+   is released again immediately). */
+int tMPI_Spinlock_islocked(tMPI_Spinlock_t *x)
+{
+    int busy;
+
+    tMPI_Spinlock_init_once(x);
+
+    busy = (tMPI_Thread_mutex_trylock((*x)->lock) != 0);
+    if (!busy)
+    {
+        /* We grabbed it just to probe; give it straight back. */
+        tMPI_Thread_mutex_unlock((*x)->lock);
+    }
+
+    return busy;
+}
+
+
+/* Block until the spinlock can be acquired, then release it again right
+   away: the caller only wants to wait, not to hold the lock. */
+void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
+{
+    tMPI_Spinlock_init_once(x);
+
+    /* Acquiring the lock means the wait is over ... */
+    tMPI_Spinlock_lock(x);
+    /* ... so hand it back immediately. */
+    tMPI_Spinlock_unlock(x);
+}
+
+#else
+
+/* just to have some symbols: when TMPI_NO_ATOMICS is not defined, none of
+   the functions above are compiled and this is the only definition left
+   in the file */
+int _tMPI_Atomics = 1;
+
+#endif
index 42ef5213d024a537feb5357ec1e40f6b46022aa6..e1c5279897c64c0d8e79a20c89bb372f85c19664 100644 (file)
@@ -68,8 +68,8 @@ void tMPI_Barrier_init(tMPI_Barrier_t *barrier, int count)
 
 int tMPI_Barrier_wait(tMPI_Barrier_t *barrier)
 {
-    int    cycle;
-    int    status;
+    int cycle;
+    int status;
 
     /* We don't need to lock or use atomic ops here, since the cycle index
      * cannot change until after the last thread has performed the check
@@ -84,11 +84,11 @@ int tMPI_Barrier_wait(tMPI_Barrier_t *barrier)
     /* Decrement the count atomically and check if it is zero.
      * This will only be true for the last thread calling us.
      */
-    if (tMPI_Atomic_add_return( &(barrier->count), -1 ) <= 0)
+    if (tMPI_Atomic_fetch_add( &(barrier->count), -1 ) <= 1)
     {
         tMPI_Atomic_memory_barrier();
         tMPI_Atomic_set(&(barrier->count), barrier->threshold);
-        tMPI_Atomic_add_return(&(barrier->cycle), 1);
+        tMPI_Atomic_fetch_add(&(barrier->cycle), 1);
 
         status = -1;
     }
index bd8acd3a67c1ca6d3ef00f72162c90f715cba192..75ae9c08c6d05f10cec75c94db78753f58a4d6f2 100644 (file)
@@ -87,8 +87,13 @@ int tMPI_Bcast(void* buffer, int count, tMPI_Datatype datatype, int root,
     if (myrank == root)
     {
         /* first set up the data */
-        tMPI_Post_multi(cev, myrank, 0, TMPI_BCAST_TAG, datatype,
-                        count*datatype->size, buffer, comm->grp.N-1, synct, -1);
+        ret = tMPI_Post_multi(cev, myrank, 0, TMPI_BCAST_TAG, datatype,
+                              count*datatype->size, buffer, comm->grp.N-1,
+                              synct, -1);
+        if (ret != TMPI_SUCCESS)
+        {
+            return ret;
+        }
         /* and wait until everybody is done copying */
         tMPI_Wait_for_others(cev, myrank);
     }
index 069db3f1d8958947cc1d1b139f79c90d3f103c3a..5434dc425d3297a1d0c429e9c08d3aa602a544bc 100644 (file)
 
 #ifdef USE_COLLECTIVE_COPY_BUFFER
 /* initialize a copy buffer */
-void tMPI_Copy_buffer_init(struct copy_buffer *cb, size_t size)
+int tMPI_Copy_buffer_init(struct copy_buffer *cb, size_t size)
 {
-    cb->buf  = tMPI_Malloc(size);
+    cb->buf = tMPI_Malloc(size);
+    if (cb->buf == NULL)
+    {
+        return TMPI_ERR_NO_MEM; /* allocation failed; cb->size is left unset */
+    }
     cb->size = size;
+    return TMPI_SUCCESS;
 }
 
 /* destroy a copy buffer */
@@ -75,19 +80,28 @@ void tMPI_Copy_buffer_destroy(struct copy_buffer *cb)
     free(cb->buf);
 }
 
-void tMPI_Copy_buffer_list_init(struct copy_buffer_list *cbl, int Nbufs,
-                                size_t size)
+int tMPI_Copy_buffer_list_init(struct copy_buffer_list *cbl, int Nbufs,
+                               size_t size)
 {
     int i;
+    int ret;
 
     cbl->size     = size;
     cbl->cb_alloc = (struct copy_buffer*)
         tMPI_Malloc(sizeof(struct copy_buffer)*Nbufs);
+    if (cbl->cb_alloc == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
     cbl->cb    = cbl->cb_alloc; /* the first one */
     cbl->Nbufs = Nbufs;
     for (i = 0; i < Nbufs; i++)
     {
-        tMPI_Copy_buffer_init( &(cbl->cb_alloc[i]), size );
+        ret = tMPI_Copy_buffer_init( &(cbl->cb_alloc[i]), size );
+        if (ret != TMPI_SUCCESS)
+        {
+            return ret;
+        }
         if (i < Nbufs-1)
         {
             cbl->cb_alloc[i].next = &(cbl->cb_alloc[i+1]);
@@ -97,6 +111,7 @@ void tMPI_Copy_buffer_list_init(struct copy_buffer_list *cbl, int Nbufs,
             cbl->cb_alloc[i].next = NULL;
         }
     }
+    return TMPI_SUCCESS;
 }
 
 void tMPI_Copy_buffer_list_destroy(struct copy_buffer_list *cbl)
@@ -115,8 +130,8 @@ struct copy_buffer *tMPI_Copy_buffer_list_get(struct copy_buffer_list *cbl)
     struct copy_buffer *ret = cbl->cb;
     if (!ret)
     {
-        fprintf(stderr, "out of copy buffers!!");
-        exit(1);
+        tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COPY_NBUFFERS);
+        return NULL;
     }
     cbl->cb = ret->next;
 
@@ -138,20 +153,38 @@ void tMPI_Copy_buffer_list_return(struct copy_buffer_list *cbl,
 
 
 
-void tMPI_Coll_envt_init(struct coll_env_thread *met, int N)
+/* Initialize one per-thread collective environment with room for N entries
+   in each of its arrays. Returns TMPI_SUCCESS, or TMPI_ERR_NO_MEM as soon
+   as one of the allocations fails.
+   NOTE(review): arrays allocated before a failing one are not released on
+   the error paths in this function. */
+int tMPI_Coll_envt_init(struct coll_env_thread *met, int N)
 {
     tMPI_Atomic_set(&(met->current_sync), 0);
     tMPI_Atomic_set(&(met->n_remaining), 0);
-    met->buf       = (void**)tMPI_Malloc(sizeof(void*)*N);
-    met->bufsize   = (size_t*)tMPI_Malloc(sizeof(size_t)*N);
+    met->buf = (void**)tMPI_Malloc(sizeof(void*)*N);
+    if (met->buf == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
+    met->bufsize = (size_t*)tMPI_Malloc(sizeof(size_t)*N);
+    if (met->bufsize == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
     met->read_data = (tmpi_bool*)tMPI_Malloc(sizeof(tmpi_bool)*N);
+    if (met->read_data == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
 #ifdef USE_COLLECTIVE_COPY_BUFFER
-    met->cpbuf    = (tMPI_Atomic_ptr_t*)tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*N);
+    met->cpbuf = (tMPI_Atomic_ptr_t*)tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*
+                                                 N);
+    /* check the array that was just allocated (cpbuf, not read_data) */
+    if (met->cpbuf == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
     met->cb       = NULL;
     met->using_cb = FALSE;
 #endif
     tMPI_Event_init( &(met->send_ev) );
     tMPI_Event_init( &(met->recv_ev) );
+    return TMPI_SUCCESS;
 }
 
 
@@ -166,19 +199,29 @@ void tMPI_Coll_envt_destroy(struct coll_env_thread *met)
 #endif
 }
 
-void tMPI_Coll_env_init(struct coll_env *cev, int N)
+int tMPI_Coll_env_init(struct coll_env *cev, int N)
 {
     int i;
+    int ret;
 
     cev->met = (struct coll_env_thread*)tMPI_Malloc(
                 sizeof(struct coll_env_thread)*N);
+    if (cev->met == NULL)
+    {
+        return TMPI_ERR_NO_MEM; /* the per-thread array could not be allocated */
+    }
     cev->N = N;
     tMPI_Atomic_set(&(cev->coll.current_sync), 0);
     tMPI_Atomic_set(&(cev->coll.n_remaining), 0);
     for (i = 0; i < N; i++)
     {
-        tMPI_Coll_envt_init(&(cev->met[i]), N);
+        ret = tMPI_Coll_envt_init(&(cev->met[i]), N);
+        if (ret != TMPI_SUCCESS)
+        {
+            return ret; /* stop at the first failing per-thread init and pass its code on */
+        }
     }
+    return TMPI_SUCCESS;
 }
 
 void tMPI_Coll_env_destroy(struct coll_env *cev)
@@ -192,7 +235,7 @@ void tMPI_Coll_env_destroy(struct coll_env *cev)
 }
 
 
-void tMPI_Coll_sync_init(struct coll_sync *csync, int N)
+int tMPI_Coll_sync_init(struct coll_sync *csync, int N)
 {
     int i;
 
@@ -201,10 +244,15 @@ void tMPI_Coll_sync_init(struct coll_sync *csync, int N)
     csync->N     = N;
 
     csync->events = (tMPI_Event*)tMPI_Malloc(sizeof(tMPI_Event)*N);
+    if (csync->events == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
     for (i = 0; i < N; i++)
     {
         tMPI_Event_init( &(csync->events[i]) );
     }
+    return TMPI_SUCCESS;
 }
 
 void tMPI_Coll_sync_destroy(struct coll_sync *csync)
@@ -244,8 +292,11 @@ struct coll_env *tMPI_Get_cev(tMPI_Comm comm, int myrank, int *counter)
 #ifdef USE_COLLECTIVE_COPY_BUFFER
     if (cev->met[myrank].using_cb)
     {
-        N = tMPI_Event_wait( &(cev->met[myrank].send_ev));
-        tMPI_Event_process( &(cev->met[myrank].send_ev), 1);
+        if (cev->N > 1)
+        {
+            N = tMPI_Event_wait( &(cev->met[myrank].send_ev));
+            tMPI_Event_process( &(cev->met[myrank].send_ev), 1);
+        }
     }
 #endif
 #ifdef USE_COLLECTIVE_COPY_BUFFER
@@ -316,7 +367,7 @@ void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
                 /* we need to try checking the pointer again after we increase
                    the read counter, signaling that one more thread
                    is reading. */
-                tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount), 1);
+                tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount), 1);
                 /* a full memory barrier */
                 tMPI_Atomic_memory_barrier();
                 try_again_srcbuf = tMPI_Atomic_ptr_get(
@@ -335,7 +386,7 @@ void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
                     /* We tried again, and this time there was a copied buffer.
                        We use that, and indicate that we're not reading from the
                        regular buf. This case should be pretty rare.  */
-                    tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount), -1);
+                    tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount), -1);
                     tMPI_Atomic_memory_barrier_acq();
                     srcbuf = try_again_srcbuf;
                 }
@@ -360,7 +411,7 @@ void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
         {
             /* we decrement the read count; potentially releasing the buffer. */
             tMPI_Atomic_memory_barrier_rel();
-            tMPI_Atomic_add_return( &(cev->met[rank].buf_readcount), -1);
+            tMPI_Atomic_fetch_add( &(cev->met[rank].buf_readcount), -1);
         }
 #endif
     }
@@ -368,8 +419,8 @@ void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
     {
         int reta;
         tMPI_Atomic_memory_barrier_rel();
-        reta = tMPI_Atomic_add_return( &(cev->met[rank].n_remaining), -1);
-        if (reta <= 0)
+        reta = tMPI_Atomic_fetch_add( &(cev->met[rank].n_remaining), -1);
+        if (reta <= 1) /* n_remaining == 0 now. */
         {
             tMPI_Event_signal( &(cev->met[rank].send_ev) );
         }
@@ -401,9 +452,9 @@ void tMPI_Coll_root_xfer(tMPI_Comm comm, tMPI_Datatype sendtype,
     memcpy(recvbuf, sendbuf, sendsize);
 }
 
-void tMPI_Post_multi(struct coll_env *cev, int myrank, int index,
-                     int tag, tMPI_Datatype datatype, size_t bufsize,
-                     void *buf, int n_remaining, int synct, int dest)
+int tMPI_Post_multi(struct coll_env *cev, int myrank, int index,
+                    int tag, tMPI_Datatype datatype, size_t bufsize,
+                    void *buf, int n_remaining, int synct, int dest)
 {
     int i;
 #ifdef USE_COLLECTIVE_COPY_BUFFER
@@ -452,10 +503,13 @@ void tMPI_Post_multi(struct coll_env *cev, int myrank, int index,
         struct tmpi_thread *cur = tMPI_Get_current();
         /* copy the buffer locally. First allocate */
         cev->met[myrank].cb = tMPI_Copy_buffer_list_get( &(cur->cbl_multi) );
+        if (cev->met[myrank].cb == NULL)
+        {
+            return TMPI_ERR_COPY_NBUFFERS;
+        }
         if (cev->met[myrank].cb->size < bufsize)
         {
-            fprintf(stderr, "ERROR: cb size too small\n");
-            exit(1);
+            return TMPI_ERR_COPY_BUFFER_SIZE;
         }
         /* copy to the new buf */
         memcpy(cev->met[myrank].cb->buf, buf, bufsize);
@@ -467,6 +521,7 @@ void tMPI_Post_multi(struct coll_env *cev, int myrank, int index,
                             cev->met[myrank].cb->buf);
     }
 #endif
+    return TMPI_SUCCESS;
 }
 
 
@@ -477,38 +532,45 @@ void tMPI_Wait_for_others(struct coll_env *cev, int myrank)
     tMPI_Profile_wait_start(cur);
 #endif
 
-#ifdef USE_COLLECTIVE_COPY_BUFFER
-    if (!(cev->met[myrank].using_cb) )
-#endif
+    if (cev->N > 1)
     {
-        /* wait until everybody else is done copying the buffer */
-        tMPI_Event_wait( &(cev->met[myrank].send_ev));
-        tMPI_Event_process( &(cev->met[myrank].send_ev), 1);
-    }
 #ifdef USE_COLLECTIVE_COPY_BUFFER
-    else
-    {
-        /* wait until everybody else is done copying the original buffer.
-           We use atomic add-return because we want to be sure of coherency.
-           This wait is bound to be very short (otherwise it wouldn't
-           be double-buffering) so we always spin here. */
-        /*tMPI_Atomic_memory_barrier_rel();*/
-#if 0
-        while (!tMPI_Atomic_cas( &(cev->met[rank].buf_readcount), 0,
-                                 -100000))
+        if (!(cev->met[myrank].using_cb) )
 #endif
+        {
+            /* wait until everybody else is done copying the buffer */
+            tMPI_Event_wait( &(cev->met[myrank].send_ev));
+            tMPI_Event_process( &(cev->met[myrank].send_ev), 1);
+        }
+#ifdef USE_COLLECTIVE_COPY_BUFFER
+        else
+        {
+            /* wait until everybody else is done copying the original buffer.
+               This wait is bound to be very short (otherwise it wouldn't
+               be double-buffering) so we always spin here. */
 #if 0
-        while (tMPI_Atomic_add_return( &(cev->met[myrank].buf_readcount), 0)
-               != 0)
+            /* Dummy compare-and-swap to a non-zero value. The atomic
+               read with barrier below is simpler, but we keep this code
+               here, disabled, in case there is ever a platform where the
+               simple read doesn't work because of, say, cache coherency
+               issues. */
+            while (!tMPI_Atomic_cas( &(cev->met[rank].buf_readcount), 0,
+                                     -100000))
 #endif
 #if 1
-        while (tMPI_Atomic_get( &(cev->met[rank].buf_readcount) ) > 0)
+            tMPI_Atomic_memory_barrier();         /* a full barrier to make
+                                                     sure that the sending
+                                                     doesn't interfere with the
+                                                     waiting */
+            while (tMPI_Atomic_get( &(cev->met[myrank].buf_readcount) ) > 0)
 #endif
-        {
+            {
+                tMPI_Atomic_memory_barrier_acq();
+            }
+            tMPI_Atomic_memory_barrier_acq();
         }
-        tMPI_Atomic_memory_barrier_acq();
-    }
 #endif
+    }
 #if defined(TMPI_PROFILE)
     tMPI_Profile_wait_stop(cur, TMPIWAIT_Coll_send);
 #endif
index 1aadef84bc02b4b9bfaa8d1cc9c18adc473e0a1b..162fda643474a7203122fcf323149a3619ed2bfe 100644 (file)
@@ -51,10 +51,10 @@ struct coll_env *tMPI_Get_cev(tMPI_Comm comm, int myrank, int *synct);
    synct       = the multicast sync number
    dest        = -1 for all threads, or a specific rank number.
  */
-void tMPI_Post_multi(struct coll_env *cev, int myrank, int index,
-                     int tag, tMPI_Datatype datatype,
-                     size_t bufsize, void *buf, int n_remaining,
-                     int synct, int dest);
+int tMPI_Post_multi(struct coll_env *cev, int myrank, int index,
+                    int tag, tMPI_Datatype datatype,
+                    size_t bufsize, void *buf, int n_remaining,
+                    int synct, int dest);
 
 /* transfer data from cev->met[rank] to recvbuf */
 void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
index 98c84e9758a778d3629c86efd17c6f7e2550201c..28464fa1ef924db5a5f0be0b7d956ad33ac17b37 100644 (file)
@@ -150,31 +150,53 @@ int tMPI_Comm_compare(tMPI_Comm comm1, tMPI_Comm comm2, int *result)
 }
 
 
-tMPI_Comm tMPI_Comm_alloc(tMPI_Comm parent, int N)
+int tMPI_Comm_alloc(tMPI_Comm *newcomm, tMPI_Comm parent, int N)
 {
-    struct tmpi_comm_ *ret;
+    struct tmpi_comm_ *retc;
     int                i;
+    int                ret;
 
-    ret            = (struct tmpi_comm_*)tMPI_Malloc(sizeof(struct tmpi_comm_));
-    ret->grp.peers = (struct tmpi_thread**)tMPI_Malloc(
+    retc = (struct tmpi_comm_*)tMPI_Malloc(sizeof(struct tmpi_comm_));
+    if (retc == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
+
+    retc->grp.peers = (struct tmpi_thread**)tMPI_Malloc(
                 sizeof(struct tmpi_thread*)*Nthreads);
-    ret->grp.N = N;
+    if (retc->grp.peers == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
+    retc->grp.N = N;
 
-    tMPI_Thread_mutex_init( &(ret->comm_create_lock) );
-    tMPI_Thread_cond_init( &(ret->comm_create_prep) );
-    tMPI_Thread_cond_init( &(ret->comm_create_finish) );
+    ret = tMPI_Thread_mutex_init( &(retc->comm_create_lock) );
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
+    ret = tMPI_Thread_cond_init( &(retc->comm_create_prep) );
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
+    ret = tMPI_Thread_cond_init( &(retc->comm_create_finish) );
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
 
-    ret->split    = NULL;
-    ret->new_comm = NULL;
+    retc->split    = NULL;
+    retc->new_comm = NULL;
     /* we have no topology to start out with */
-    ret->cart = NULL;
-    /*ret->graph=NULL;*/
+    retc->cart = NULL;
+    /*retc->graph=NULL;*/
 
     /* we start counting at 0 */
-    tMPI_Atomic_set( &(ret->destroy_counter), 0);
+    tMPI_Atomic_set( &(retc->destroy_counter), 0);
 
     /* initialize the main barrier */
-    tMPI_Barrier_init(&(ret->barrier), N);
+    tMPI_Barrier_init(&(retc->barrier), N);
 
     /* the reduce barriers */
     {
@@ -189,11 +211,19 @@ tMPI_Comm tMPI_Comm_alloc(tMPI_Comm parent, int N)
             Niter += 1;
         }
 
-        ret->N_reduce_iter = Niter;
+        retc->N_reduce_iter = Niter;
         /* allocate the list */
-        ret->reduce_barrier = (tMPI_Barrier_t**)
+        retc->reduce_barrier = (tMPI_Barrier_t**)
             tMPI_Malloc(sizeof(tMPI_Barrier_t*)*(Niter+1));
-        ret->N_reduce = (int*)tMPI_Malloc(sizeof(int)*(Niter+1));
+        if (retc->reduce_barrier == NULL)
+        {
+            return TMPI_ERR_NO_MEM;
+        }
+        retc->N_reduce = (int*)tMPI_Malloc(sizeof(int)*(Niter+1));
+        if (retc->N_reduce == NULL)
+        {
+            return TMPI_ERR_NO_MEM;
+        }
 
         /* we re-set Nred to N */
         Nred = N;
@@ -201,68 +231,108 @@ tMPI_Comm tMPI_Comm_alloc(tMPI_Comm parent, int N)
         {
             int j;
 
-            Nred             = Nred/2 + Nred%2;
-            ret->N_reduce[i] = Nred;
+            Nred              = Nred/2 + Nred%2;
+            retc->N_reduce[i] = Nred;
             /* allocate the sub-list */
-            ret->reduce_barrier[i] = (tMPI_Barrier_t*)
+            retc->reduce_barrier[i] = (tMPI_Barrier_t*)
                 tMPI_Malloc(sizeof(tMPI_Barrier_t)*(Nred));
+            if (retc->reduce_barrier[i] == NULL)
+            {
+                return TMPI_ERR_NO_MEM;
+            }
             for (j = 0; j < Nred; j++)
             {
-                tMPI_Barrier_init(&(ret->reduce_barrier[i][j]), 2);
+                tMPI_Barrier_init(&(retc->reduce_barrier[i][j]), 2);
             }
         }
     }
 
     /* the reduce buffers */
-    ret->reduce_sendbuf = (tMPI_Atomic_ptr_t*)
+    retc->reduce_sendbuf = (tMPI_Atomic_ptr_t*)
         tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
-    ret->reduce_recvbuf = (tMPI_Atomic_ptr_t*)
+    if (retc->reduce_sendbuf == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
+    retc->reduce_recvbuf = (tMPI_Atomic_ptr_t*)
         tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
-
+    if (retc->reduce_recvbuf == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
 
     if (parent)
     {
-        ret->erh = parent->erh;
+        retc->erh = parent->erh;
     }
     else
     {
-        ret->erh = TMPI_ERRORS_ARE_FATAL;
+        retc->erh = TMPI_ERRORS_ARE_FATAL;
     }
 
     /* coll_env objects */
-    ret->cev = (struct coll_env*)tMPI_Malloc(sizeof(struct coll_env)*N_COLL_ENV);
+    retc->cev = (struct coll_env*)tMPI_Malloc(sizeof(struct coll_env)*
+                                              N_COLL_ENV);
+    if (retc->cev == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
+
     for (i = 0; i < N_COLL_ENV; i++)
     {
-        tMPI_Coll_env_init( &(ret->cev[i]), N);
+        ret = tMPI_Coll_env_init( &(retc->cev[i]), N);
+        if (ret != TMPI_SUCCESS)
+        {
+            return ret;
+        }
     }
     /* multi_sync objects */
-    ret->csync = (struct coll_sync*)tMPI_Malloc(sizeof(struct coll_sync)*N);
+    retc->csync = (struct coll_sync*)tMPI_Malloc(sizeof(struct coll_sync)*N);
+    if (retc->csync == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
+
     for (i = 0; i < N; i++)
     {
-        tMPI_Coll_sync_init( &(ret->csync[i]), N);
+        ret = tMPI_Coll_sync_init( &(retc->csync[i]), N);
+        if (ret != TMPI_SUCCESS)
+        {
+            return ret;
+        }
     }
 
-    tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
+    ret = tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
     /* we insert ourselves in the circular list, just before TMPI_COMM_WORLD */
     if (TMPI_COMM_WORLD)
     {
-        ret->next = TMPI_COMM_WORLD;
-        ret->prev = TMPI_COMM_WORLD->prev;
+        retc->next = TMPI_COMM_WORLD;
+        retc->prev = TMPI_COMM_WORLD->prev;
 
-        TMPI_COMM_WORLD->prev->next = ret;
-        TMPI_COMM_WORLD->prev       = ret;
+        TMPI_COMM_WORLD->prev->next = retc;
+        TMPI_COMM_WORLD->prev       = retc;
     }
     else
     {
-        ret->prev = ret->next = ret;
+        retc->prev = retc->next = retc;
+    }
+    ret = tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
     }
-    tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
-    return ret;
+    *newcomm = retc;
+    return TMPI_SUCCESS;
 }
 
-void tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock)
+int tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock)
 {
     int i;
+    int ret;
 
     free(comm->grp.peers);
     for (i = 0; i < comm->N_reduce_iter; i++)
@@ -283,9 +353,21 @@ void tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock)
     free(comm->cev);
     free(comm->csync);
 
-    tMPI_Thread_mutex_destroy( &(comm->comm_create_lock) );
-    tMPI_Thread_cond_destroy( &(comm->comm_create_prep) );
-    tMPI_Thread_cond_destroy( &(comm->comm_create_finish) );
+    ret = tMPI_Thread_mutex_destroy( &(comm->comm_create_lock) );
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
+    ret = tMPI_Thread_cond_destroy( &(comm->comm_create_prep) );
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
+    ret = tMPI_Thread_cond_destroy( &(comm->comm_create_finish) );
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
 
     free((void*)comm->reduce_sendbuf);
     free((void*)comm->reduce_recvbuf);
@@ -299,7 +381,11 @@ void tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock)
     /* remove ourselves from the circular list */
     if (do_link_lock)
     {
-        tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
+        ret = tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
+        if (ret != 0)
+        {
+            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+        }
     }
     if (comm->next)
     {
@@ -312,14 +398,20 @@ void tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock)
     free(comm);
     if (do_link_lock)
     {
-        tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
+        ret = tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
+        if (ret != 0)
+        {
+            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+        }
     }
+    return TMPI_SUCCESS;
 }
 
 int tMPI_Comm_free(tMPI_Comm *comm)
 {
     int size;
     int sum;
+    int ret;
 #ifdef TMPI_TRACE
     tMPI_Trace_print("tMPI_Comm_free(%p)", comm);
 #endif
@@ -332,15 +424,27 @@ int tMPI_Comm_free(tMPI_Comm *comm)
     if ((*comm)->grp.N > 1)
     {
         /* we remove ourselves from the comm. */
-        tMPI_Thread_mutex_lock(&((*comm)->comm_create_lock));
+        ret = tMPI_Thread_mutex_lock(&((*comm)->comm_create_lock));
+        if (ret != 0)
+        {
+            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+        }
         (*comm)->grp.peers[myrank] = (*comm)->grp.peers[(*comm)->grp.N-1];
         (*comm)->grp.N--;
-        tMPI_Thread_mutex_unlock(&((*comm)->comm_create_lock));
+        ret = tMPI_Thread_mutex_unlock(&((*comm)->comm_create_lock));
+        if (ret != 0)
+        {
+            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+        }
     }
     else
     {
         /* we're the last one so we can safely destroy it */
-        tMPI_Comm_destroy(*comm, TRUE);
+        ret = tMPI_Comm_destroy(*comm, TRUE);
+        if (ret != 0)
+        {
+            return ret;
+        }
     }
 #else
     /* This is correct if programs actually treat Comm_free as a collective
@@ -354,12 +458,16 @@ int tMPI_Comm_free(tMPI_Comm *comm)
 
     /* we add 1 to the destroy counter and actually deallocate if the counter
        reaches N. */
-    sum = tMPI_Atomic_add_return( &((*comm)->destroy_counter), 1);
+    sum = tMPI_Atomic_fetch_add( &((*comm)->destroy_counter), 1) + 1;
     /* this is a collective call on a shared data structure, so only
        one process (the last one in this case) should do anything */
     if (sum == size)
     {
-        tMPI_Comm_destroy(*comm, TRUE);
+        ret = tMPI_Comm_destroy(*comm, TRUE);
+        if (ret != 0)
+        {
+            return ret;
+        }
     }
 #endif
     return TMPI_SUCCESS;
@@ -456,6 +564,7 @@ int tMPI_Comm_split(tMPI_Comm comm, int color, int key, tMPI_Comm *newcomm)
     tmpi_bool          i_am_first = FALSE;
     int                myrank     = tMPI_Comm_seek_rank(comm, tMPI_Get_current());
     struct tmpi_split *spl;
+    int                ret;
 
 #ifdef TMPI_TRACE
     tMPI_Trace_print("tMPI_Comm_split(%p, %d, %d, %p)", comm, color, key,
@@ -467,7 +576,11 @@ int tMPI_Comm_split(tMPI_Comm comm, int color, int key, tMPI_Comm *newcomm)
         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
     }
 
-    tMPI_Thread_mutex_lock(&(comm->comm_create_lock));
+    ret = tMPI_Thread_mutex_lock(&(comm->comm_create_lock));
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
     /* first get the colors */
     if (!comm->new_comm)
     {
@@ -498,7 +611,11 @@ int tMPI_Comm_split(tMPI_Comm comm, int color, int key, tMPI_Comm *newcomm)
 
     if (spl->Ncol_init == 0)
     {
-        tMPI_Thread_cond_signal(&(comm->comm_create_prep));
+        ret = tMPI_Thread_cond_signal(&(comm->comm_create_prep));
+        if (ret != 0)
+        {
+            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+        }
     }
 
     if (!i_am_first)
@@ -507,8 +624,12 @@ int tMPI_Comm_split(tMPI_Comm comm, int color, int key, tMPI_Comm *newcomm)
            finished */
         while (!spl->can_finish)
         {
-            tMPI_Thread_cond_wait(&(comm->comm_create_finish),
-                                  &(comm->comm_create_lock) );
+            ret = tMPI_Thread_cond_wait(&(comm->comm_create_finish),
+                                        &(comm->comm_create_lock) );
+            if (ret != 0)
+            {
+                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+            }
         }
     }
     else
@@ -526,8 +647,12 @@ int tMPI_Comm_split(tMPI_Comm comm, int color, int key, tMPI_Comm *newcomm)
         /*if (N>1)*/
         while (spl->Ncol_init > 0)
         {
-            tMPI_Thread_cond_wait(&(comm->comm_create_prep),
-                                  &(comm->comm_create_lock));
+            ret = tMPI_Thread_cond_wait(&(comm->comm_create_prep),
+                                        &(comm->comm_create_lock));
+            if (ret != 0)
+            {
+                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+            }
         }
 
         /* reset the state so that a new comm creating function can run */
@@ -553,7 +678,11 @@ int tMPI_Comm_split(tMPI_Comm comm, int color, int key, tMPI_Comm *newcomm)
         comms = (tMPI_Comm*)tMPI_Malloc(Ncomms*sizeof(tMPI_Comm));
         for (i = 0; i < Ncomms; i++)
         {
-            comms[i] = tMPI_Comm_alloc(comm, comm_N[i]);
+            ret = tMPI_Comm_alloc(&(comms[i]), comm, comm_N[i]);
+            if (ret != TMPI_SUCCESS)
+            {
+                return ret;
+            }
         }
 
         /* now distribute the comms */
@@ -616,7 +745,11 @@ int tMPI_Comm_split(tMPI_Comm comm, int color, int key, tMPI_Comm *newcomm)
         spl->can_finish = TRUE;
 
         /* tell the waiting threads that there's a comm ready */
-        tMPI_Thread_cond_broadcast(&(comm->comm_create_finish));
+        ret = tMPI_Thread_cond_broadcast(&(comm->comm_create_finish));
+        if (ret != 0)
+        {
+            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+        }
     }
     /* here the individual threads get their comm object */
     *newcomm = newcomm_list[myrank];
@@ -629,7 +762,11 @@ int tMPI_Comm_split(tMPI_Comm comm, int color, int key, tMPI_Comm *newcomm)
         free(spl);
     }
 
-    tMPI_Thread_mutex_unlock(&(comm->comm_create_lock));
+    ret = tMPI_Thread_mutex_unlock(&(comm->comm_create_lock));
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
 
     return TMPI_SUCCESS;
 }
index fe8d444c7000cb668139e1ceff99a1364e688d19..60f77420fb0d6080a123ddd9432b91be08d7752d 100644 (file)
@@ -77,6 +77,7 @@ static const char *tmpi_errmsg[] =
 {
     "No error",
     "malloc failure in tMPI (out of memory)",
+    "I/O or system error",
     "tMPI Initialization error",
     "tMPI Finalize error",
     "Invalid tMPI_Group",
@@ -96,6 +97,10 @@ static const char *tmpi_errmsg[] =
     "Invalid reduce operator",
     "Out of receive envelopes: this shouldn't happen (probably a bug).",
     "Out of receive requests: this shouldn't happen (probably a bug).",
+    "Out of copy buffers: this shouldn't happen (probably a bug).",
+    "Copy buffer size too small: this shouldn't happen (probably a bug).",
+    "Error in MPI_Status",
+    "Error getting/setting processor layout/affinity",
     "Transmission failure",
     "Unknown tMPI error"
 };
@@ -125,11 +130,29 @@ int tMPI_Error_string(int errorcode, char *strn, size_t *resultlen)
         errorcode = TMPI_ERR_UNKNOWN;
     }
 
+    if (errorcode != TMPI_ERR_IO)
+    {
+#if !(defined( _WIN32 ) || defined( _WIN64 ) )
+        strncpy(strn, tmpi_errmsg[errorcode], TMPI_MAX_ERROR_STRING);
+#else
+        strncpy_s(strn, TMPI_MAX_ERROR_STRING, tmpi_errmsg[errorcode],
+                  TMPI_MAX_ERROR_STRING);
+#endif
+    }
+    else
+    {
 #if !(defined( _WIN32 ) || defined( _WIN64 ) )
-    strncpy(strn, tmpi_errmsg[errorcode], TMPI_MAX_ERROR_STRING);
+        snprintf(strn, TMPI_MAX_ERROR_STRING,
+                 "%s: %s", tmpi_errmsg[errorcode], strerror(errno));
 #else
-    strncpy_s(strn, TMPI_MAX_ERROR_STRING, tmpi_errmsg[errorcode], TMPI_MAX_ERROR_STRING);
+        char buf[TMPI_MAX_ERROR_STRING];
+
+        strerror_s(buf, TMPI_MAX_ERROR_STRING-1, errno);
+        _snprintf_s(strn, TMPI_MAX_ERROR_STRING, _TRUNCATE,
+                    "%s: %s", tmpi_errmsg[errorcode], buf);
 #endif
+
+    }
     *resultlen = strlen(strn);
     return TMPI_SUCCESS;
 }
@@ -147,7 +170,7 @@ int tMPI_Create_errhandler(tMPI_Errhandler_fn *function,
     if (!*errhandler)
     {
         fprintf(stderr, "tMPI fatal error (%s), bailing out\n",
-                tmpi_errmsg[TMPI_ERR_MALLOC]);
+                tmpi_errmsg[TMPI_ERR_NO_MEM]);
         abort();
     }
     (*errhandler)->err = 0;
index a43f2db53db039b68cedfe22f1d61bbd89124d38..54e91bf69b64430ad336ed53cd82d0eba9b529c8 100644 (file)
@@ -145,8 +145,12 @@ int tMPI_Gather(void* sendbuf, int sendcount, tMPI_Datatype sendtype,
         }
 
         /* first set up the data just to root. */
-        tMPI_Post_multi(cev, myrank, 0, TMPI_GATHER_TAG, sendtype,
-                        sendcount*sendtype->size, sendbuf, 1, synct, root);
+        ret = tMPI_Post_multi(cev, myrank, 0, TMPI_GATHER_TAG, sendtype,
+                              sendcount*sendtype->size, sendbuf, 1, synct, root);
+        if (ret != TMPI_SUCCESS)
+        {
+            return ret;
+        }
         /* and wait until root is done copying */
         tMPI_Wait_for_others(cev, myrank);
     }
@@ -247,8 +251,12 @@ int tMPI_Gatherv(void* sendbuf, int sendcount, tMPI_Datatype sendtype,
         }
 
         /* first set up the data just to root. */
-        tMPI_Post_multi(cev, myrank, 0, TMPI_GATHERV_TAG, sendtype,
-                        sendcount*sendtype->size, sendbuf, 1, synct, root);
+        ret = tMPI_Post_multi(cev, myrank, 0, TMPI_GATHERV_TAG, sendtype,
+                              sendcount*sendtype->size, sendbuf, 1, synct, root);
+        if (ret != TMPI_SUCCESS)
+        {
+            return ret;
+        }
         /* and wait until root is done copying */
         tMPI_Wait_for_others(cev, myrank);
     }
index 17b4b46661bc195f02f590555b43b668d1ecfdb9..94d0646302fe21edab3c4b72ead5f47e315da608 100644 (file)
@@ -488,7 +488,7 @@ struct tmpi_global
     int                     Nalloc_usertypes;
 
     /* spinlock/mutex for manipulating tmpi_user_types */
-    tMPI_Spinlock_t  datatype_lock;
+    tMPI_Spinlock_t datatype_lock;
 
     /* Lock to prevent multiple threads manipulating the linked list of comm
        structures.*/
@@ -739,9 +739,9 @@ int tMPI_Comm_seek_rank(tMPI_Comm comm, struct tmpi_thread *th);
 int tMPI_Comm_N(tMPI_Comm comm);
 
 /* allocate a comm object, making space for N threads */
-tMPI_Comm tMPI_Comm_alloc(tMPI_Comm parent, int N);
+int tMPI_Comm_alloc(tMPI_Comm *newcomm, tMPI_Comm parent, int N);
 /* de-allocate a comm object */
-void tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock);
+int tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock);
 /* allocate a group object */
 tMPI_Group tMPI_Group_alloc(void);
 
@@ -756,13 +756,13 @@ void tMPI_Cart_destroy(struct cart_topol *top);
 
 
 /* initialize a free envelope list with N envelopes */
-void tMPI_Free_env_list_init(struct free_envelope_list *evl, int N);
+int tMPI_Free_env_list_init(struct free_envelope_list *evl, int N);
 /* destroy a free envelope list */
 void tMPI_Free_env_list_destroy(struct free_envelope_list *evl);
 
 
 /* initialize a send envelope list */
-void tMPI_Send_env_list_init(struct send_envelope_list *evl, int N);
+int tMPI_Send_env_list_init(struct send_envelope_list *evl, int N);
 /* destroy a send envelope list */
 void tMPI_Send_env_list_destroy(struct send_envelope_list *evl);
 
@@ -772,7 +772,7 @@ void tMPI_Send_env_list_destroy(struct send_envelope_list *evl);
 
 
 /* initialize a recv envelope list */
-void tMPI_Recv_env_list_init(struct recv_envelope_list *evl);
+int tMPI_Recv_env_list_init(struct recv_envelope_list *evl);
 /* destroy a recv envelope list */
 void tMPI_Recv_env_list_destroy(struct recv_envelope_list *evl);
 
@@ -780,7 +780,7 @@ void tMPI_Recv_env_list_destroy(struct recv_envelope_list *evl);
 
 
 /* initialize request list */
-void tMPI_Req_list_init(struct req_list *rl, int N_reqs);
+int tMPI_Req_list_init(struct req_list *rl, int N_reqs);
 /* destroy request list */
 void tMPI_Req_list_destroy(struct req_list *rl);
 
@@ -790,19 +790,19 @@ void tMPI_Req_list_destroy(struct req_list *rl);
 
 
 /* initialize a coll env structure */
-void tMPI_Coll_env_init(struct coll_env *mev, int N);
+int tMPI_Coll_env_init(struct coll_env *mev, int N);
 /* destroy a coll env structure */
 void tMPI_Coll_env_destroy(struct coll_env *mev);
 
 /* initialize a coll sync structure */
-void tMPI_Coll_sync_init(struct coll_sync *msc, int N);
+int tMPI_Coll_sync_init(struct coll_sync *msc, int N);
 /* destroy a coll sync structure */
 void tMPI_Coll_sync_destroy(struct coll_sync *msc);
 
 #ifdef USE_COLLECTIVE_COPY_BUFFER
 /* initialize a copy_buffer_list */
-void tMPI_Copy_buffer_list_init(struct copy_buffer_list *cbl, int Nbufs,
-                                size_t size);
+int tMPI_Copy_buffer_list_init(struct copy_buffer_list *cbl, int Nbufs,
+                               size_t size);
 /* destroy a copy_buffer_list */
 void tMPI_Copy_buffer_list_destroy(struct copy_buffer_list *cbl);
 /* get a copy buffer from a list */
@@ -811,7 +811,7 @@ struct copy_buffer *tMPI_Copy_buffer_list_get(struct copy_buffer_list *cbl);
 void tMPI_Copy_buffer_list_return(struct copy_buffer_list *cbl,
                                   struct copy_buffer      *cb);
 /* initialize a copy buffer */
-void tMPI_Copy_buffer_init(struct copy_buffer *cb, size_t size);
+int tMPI_Copy_buffer_init(struct copy_buffer *cb, size_t size);
 void tMPI_Copy_buffer_destroy(struct copy_buffer *cb);
 #endif
 
index 3fd19bd670dc44f3878a567644f92b30ab83c4b5..a0c51674f641cc28a053e6d5376ef46c9436cc99 100644 (file)
@@ -83,7 +83,7 @@ int tMPI_Lock_trylock(tMPI_Lock_t *lock)
     return tMPI_Spinlock_trylock(&(lock->lock));
 }
 
-int tMPI_Lock_islocked(const tMPI_Lock_t *lock)
+int tMPI_Lock_islocked(tMPI_Lock_t *lock)
 {
     return tMPI_Spinlock_islocked(&(lock->lock));
 }
index 5c659d095f64ab1ba555b0d711356a973172b7a1..be248ef25a98c0acc2b7da391edeb5e9900b9e64 100644 (file)
@@ -141,7 +141,7 @@ void* tMPI_Once_wait(tMPI_Comm comm, void* (*function)(void*), void *param,
 
         tMPI_Atomic_memory_barrier_rel();
         /* signal that we're done */
-        tMPI_Atomic_add_return(&(cev->coll.current_sync), 1);
+        tMPI_Atomic_fetch_add(&(cev->coll.current_sync), 1);
         /* we need to keep being in sync */
         csync->syncs++;
     }
index c1d565cf81aa51ae1e5b3fc3b3cde5ffa3106b6d..4851b204d98b707f61a3fd8eede90e91ac3dd10e 100644 (file)
@@ -122,13 +122,17 @@ static void tMPI_Xfer(struct tmpi_thread *cur, struct envelope *sev,
 
 
 /* Point-to-point communication protocol functions */
-void tMPI_Free_env_list_init(struct free_envelope_list *evl, int N)
+int tMPI_Free_env_list_init(struct free_envelope_list *evl, int N)
 {
     int i;
 
     /* allocate the head element */
     evl->recv_alloc_head = (struct envelope*)tMPI_Malloc(sizeof(struct envelope)
                                                          *N);
+    if (evl->recv_alloc_head == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
     evl->head_recv = evl->recv_alloc_head;
 
     for (i = 0; i < N; i++)
@@ -144,6 +148,7 @@ void tMPI_Free_env_list_init(struct free_envelope_list *evl, int N)
         evl->head_recv[i].rlist = NULL;
         evl->head_recv[i].slist = NULL;
     }
+    return TMPI_SUCCESS;
 }
 
 void tMPI_Free_env_list_destroy(struct free_envelope_list *evl)
@@ -159,9 +164,8 @@ static struct envelope* tMPI_Free_env_list_fetch_recv(struct
     struct envelope *ret;
     if (!evl->head_recv)
     {
-        /* TODO: make this do something better than crash */
-        fprintf(stderr, "Ran out of recv envelopes!!!!\n");
-        abort();
+        tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_ENVELOPES);
+        return NULL;
     }
 
     ret            = evl->head_recv;
@@ -194,7 +198,7 @@ static void tMPI_Free_env_list_return_recv(struct free_envelope_list *evl,
 
 /* tmpi_send_envelope_list functions */
 
-void tMPI_Send_env_list_init(struct send_envelope_list *evl, int N)
+int tMPI_Send_env_list_init(struct send_envelope_list *evl, int N)
 {
     int i;
 #ifndef TMPI_LOCK_FREE_LISTS
@@ -204,6 +208,10 @@ void tMPI_Send_env_list_init(struct send_envelope_list *evl, int N)
     evl->Nalloc = N;
 
     evl->alloc_head = (struct envelope*)tMPI_Malloc(sizeof(struct envelope)*N);
+    if (evl->alloc_head == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
     for (i = 0; i < N; i++)
     {
         evl->alloc_head[i].next  = (i < (N-1)) ? &(evl->alloc_head[i+1]) : NULL;
@@ -211,7 +219,12 @@ void tMPI_Send_env_list_init(struct send_envelope_list *evl, int N)
         evl->alloc_head[i].slist = evl;
         evl->alloc_head[i].rlist = NULL;
 #ifdef USE_SEND_RECV_COPY_BUFFER
-        evl->alloc_head[i].cb = (void*)tMPI_Malloc(sizeof(char)*COPY_BUFFER_SIZE);
+        evl->alloc_head[i].cb = (void*)tMPI_Malloc(sizeof(char)*
+                                                   COPY_BUFFER_SIZE);
+        if (evl->alloc_head[i].cb == NULL)
+        {
+            return TMPI_ERR_NO_MEM;
+        }
 #endif
     }
 
@@ -227,6 +240,7 @@ void tMPI_Send_env_list_init(struct send_envelope_list *evl, int N)
     evl->head_old       = evl->alloc_head; /* the first element is a dummy */
     evl->head_old->next = evl->head_old;
     evl->head_old->prev = evl->head_old;
+    return TMPI_SUCCESS;
 }
 
 void tMPI_Send_env_list_destroy(struct send_envelope_list *evl)
@@ -295,7 +309,10 @@ static struct envelope* tMPI_Send_env_list_fetch_new(struct
         {
             /* There are no free send envelopes, so all we can do is handle
                incoming requests until we get a free send envelope. */
+#if defined(TMPI_DEBUG)  || defined(TMPI_WARNINGS)
             printf("Ran out of send envelopes!!\n");
+            fflush(stdout);
+#endif
             tMPI_Wait_process_incoming(tMPI_Get_current());
         }
 #else
@@ -304,8 +321,8 @@ static struct envelope* tMPI_Send_env_list_fetch_new(struct
                calling program. We could fix the situation by waiting,
                but that would most likely lead to deadlocks - even
                more difficult to debug than this. */
-            fprintf(stderr, "Ran out of send envelopes!!!!\n");
-            abort();
+            tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_ENVELOPES);
+            return NULL;
         }
 #endif
     }
@@ -451,11 +468,13 @@ static void tMPI_Send_env_list_move_to_old(struct envelope *sev)
 
 /* tmpi_recv_envelope_list functions */
 
-void tMPI_Recv_env_list_init(struct recv_envelope_list *evl)
+int tMPI_Recv_env_list_init(struct recv_envelope_list *evl)
 {
     evl->head       = &(evl->dummy);
     evl->head->prev = evl->head;
     evl->head->next = evl->head;
+
+    return TMPI_SUCCESS;
 }
 
 void tMPI_Recv_env_list_destroy(struct recv_envelope_list *evl)
@@ -499,12 +518,16 @@ static void tMPI_Recv_env_list_remove(struct envelope *rev)
 
 /* tmpi_req functions */
 
-void tMPI_Req_list_init(struct req_list *rl, int N_reqs)
+int tMPI_Req_list_init(struct req_list *rl, int N_reqs)
 {
     int i;
 
     rl->alloc_head = (struct tmpi_req_*)tMPI_Malloc(
                 sizeof(struct tmpi_req_)*N_reqs);
+    if (rl->alloc_head == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
     rl->head = rl->alloc_head;
     for (i = 0; i < N_reqs; i++)
     {
@@ -526,6 +549,7 @@ void tMPI_Req_list_init(struct req_list *rl, int N_reqs)
             rl->head[i].next = &(rl->head[i+1]);
         }
     }
+    return TMPI_SUCCESS;
 }
 
 void tMPI_Req_list_destroy(struct req_list *rl)
@@ -650,8 +674,8 @@ tmpi_bool tMPI_Envelope_matches(const struct envelope *sev,
          ( (!rev->src)  || (rev->src == sev->src) ) &&
          ( sev->dest == rev->dest ) &&
          ( sev->datatype == rev->datatype ) &&
-         ( sev->state.value < env_finished  &&
-           rev->state.value == env_unmatched ) )
+         ( tMPI_Atomic_get(&(sev->state)) < env_finished  &&
+           tMPI_Atomic_get(&(rev->state)) == env_unmatched ) )
     {
 #ifdef TMPI_DEBUG
         printf("%5d: (%d->%d) tag=%d found match\n",
@@ -749,8 +773,12 @@ void tMPI_Send_copy_buffer(struct envelope *sev, struct tmpi_req_ *req)
        we first need to wait until the receiver is finished copying. We
        know this is a short wait (since the buffer was small enough to be
        buffered in the first place), so we just spin-wait.  */
+    tMPI_Atomic_memory_barrier(); /* a full barrier to make sure that the
+                                     sending doesn't interfere with the
+                                     waiting */
     while (tMPI_Atomic_get( &(sev->state) ) < env_cb_available)
     {
+        tMPI_Atomic_memory_barrier_acq();
     }
     tMPI_Atomic_memory_barrier_acq();
 #ifdef TMPI_DEBUG
@@ -776,6 +804,10 @@ struct envelope* tMPI_Prep_send_envelope(struct send_envelope_list *evl,
 {
     /* get an envelope from the send-envelope stack */
     struct envelope *ev = tMPI_Send_env_list_fetch_new( evl );
+    if (ev == NULL)
+    {
+        return NULL;
+    }
 
     ev->tag      = tag;
     ev->nonblock = nonblock;
@@ -821,6 +853,10 @@ struct envelope* tMPI_Prep_recv_envelope(struct tmpi_thread *cur,
 {
     /* get an envelope from the stack */
     struct envelope *ev = tMPI_Free_env_list_fetch_recv( &(cur->envelopes) );
+    if (ev == NULL)
+    {
+        return NULL;
+    }
 
     ev->tag      = tag;
     ev->nonblock = nonblock;
@@ -931,7 +967,7 @@ static void tMPI_Xfer(struct tmpi_thread *cur, struct envelope *sev,
     tMPI_Atomic_set( &(sev->state), env_finished);
 
     /* signal to a potentially waiting thread that we're done. */
-    tMPI_Atomic_add_return( &(rev->src->ev_outgoing_received), 1);
+    tMPI_Atomic_fetch_add( &(rev->src->ev_outgoing_received), 1);
     tMPI_Event_signal(&(rev->src->p2p_event));
 
     /* remove the receiving envelope if it's in a list */
@@ -978,6 +1014,10 @@ struct envelope* tMPI_Post_match_recv(struct tmpi_thread *cur,
     /* reserve an envelope to post */
     rev = tMPI_Prep_recv_envelope(cur, comm, src, dest, recv_buf, recv_count,
                                   datatype, tag, nonblock);
+    if (rev == NULL)
+    {
+        return NULL;
+    }
 
 #ifdef TMPI_DEBUG
     printf("%5d: tMPI_Post_match_recv (%d->%d, tag=%d) started\n",
@@ -1048,6 +1088,10 @@ struct envelope *tMPI_Post_send(struct tmpi_thread *cur,
     /* reserve an envelope to post */
     sev = tMPI_Prep_send_envelope(sevl, comm, src, dest, send_buf, send_count,
                                   datatype, tag, nonblock);
+    if (sev == NULL)
+    {
+        return NULL;
+    }
 
 #ifdef TMPI_DEBUG
     printf("%5d: tMPI_Post_send (%d->%d, tag=%d)\n",
@@ -1084,7 +1128,7 @@ void tMPI_Wait_process_incoming(struct tmpi_thread *cur)
     tMPI_Profile_wait_stop(cur, TMPIWAIT_P2p);
 #endif
     n_handled = tMPI_Atomic_get(&(cur->ev_outgoing_received));
-    tMPI_Atomic_add_return( &(cur->ev_outgoing_received), -n_handled);
+    tMPI_Atomic_fetch_add( &(cur->ev_outgoing_received), -n_handled);
     check_id -= n_handled;
 
     if (check_id > 0)
index b2a305cfee8324b8140fe161f4bdaa7aa28d0a3a..3d0379cac7fc6b328dd2f930ce69ef747d9e473a 100644 (file)
@@ -87,6 +87,10 @@ int tMPI_Send(void* buf, int count, tMPI_Datatype datatype, int dest,
     }
 
     sev = tMPI_Post_send(cur, comm, send_dst, buf, count, datatype, tag, FALSE);
+    if (sev == NULL)
+    {
+        return TMPI_ERR_ENVELOPES;
+    }
     tMPI_Req_init(&req, sev);
     tMPI_Wait_single(cur, &req);
 
@@ -130,6 +134,10 @@ int tMPI_Recv(void* buf, int count, tMPI_Datatype datatype, int source,
 
     rev = tMPI_Post_match_recv(cur, comm, recv_src, buf, count, datatype, tag,
                                FALSE);
+    if (rev == NULL)
+    {
+        return TMPI_ERR_ENVELOPES;
+    }
     tMPI_Req_init(&req, rev);
     tMPI_Wait_single(cur, &req);
 
@@ -185,10 +193,18 @@ int tMPI_Sendrecv(void *sendbuf, int sendcount, tMPI_Datatype sendtype,
     /* we first prepare to send */
     sev = tMPI_Post_send(cur, comm, send_dst, sendbuf, sendcount,
                          sendtype, sendtag, FALSE);
+    if (sev == NULL)
+    {
+        return TMPI_ERR_ENVELOPES;
+    }
     tMPI_Req_init(&sreq, sev);
     /* the we prepare to receive */
     rev = tMPI_Post_match_recv(cur, comm, recv_src, recvbuf, recvcount,
                                recvtype, recvtag, FALSE);
+    if (rev == NULL)
+    {
+        return TMPI_ERR_ENVELOPES;
+    }
     tMPI_Req_init(&rreq, rev);
 
     /* fix the pointers */
@@ -256,6 +272,10 @@ int tMPI_Isend(void* buf, int count, tMPI_Datatype datatype, int dest,
         return tMPI_Error(comm, TMPI_ERR_SEND_DEST);
     }
     ev = tMPI_Post_send(cur, comm, send_dst, buf, count, datatype, tag, TRUE);
+    if (ev == NULL)
+    {
+        return TMPI_ERR_ENVELOPES;
+    }
     tMPI_Req_init(rq, ev);
     *request = rq;
 
@@ -299,6 +319,10 @@ int tMPI_Irecv(void* buf, int count, tMPI_Datatype datatype, int source,
     }
     ev = tMPI_Post_match_recv(cur, comm, recv_src, buf, count, datatype, tag,
                               TRUE);
+    if (ev == NULL)
+    {
+        return TMPI_ERR_ENVELOPES;
+    }
     tMPI_Req_init(rq, ev);
     *request = rq;
 #ifdef TMPI_PROFILE
index a7de36be2c10a5d059f5c8bc19b519ca8574880f..c92e0fad25cdf00fc9385a988265705ebbe2ccbf 100644 (file)
@@ -120,7 +120,7 @@ const char *tmpi_waitfn_names[] =
    stage about empty object files */
 #ifdef TMPI_PROFILE
 
-void tMPI_Profile_init(struct tmpi_profile *prof)
+int tMPI_Profile_init(struct tmpi_profile *prof)
 {
     int i;
 
@@ -138,7 +138,7 @@ void tMPI_Profile_init(struct tmpi_profile *prof)
     {
         prof->wait_cycles[i] = 0;
     }
-    prof->global_start = tmpi_cycles_read();
+    prof->global_start = tMPI_Cycles_read();
     prof->global_stop  = 0;
     prof->wait_start   = 0;
 #endif
@@ -148,6 +148,8 @@ void tMPI_Profile_init(struct tmpi_profile *prof)
     prof->total_p2p_xfers     = 0;
     prof->total_coll_xfers    = 0;
     tMPI_Profile_started      = 1;
+
+    return TMPI_SUCCESS;
 }
 
 
@@ -162,7 +164,7 @@ void tMPI_Profile_destroy(struct tmpi_profile *prof)
 void tMPI_Profile_stop(struct tmpi_profile *prof)
 {
 #ifdef TMPI_CYCLE_COUNT
-    prof->global_stop = tmpi_cycles_read();
+    prof->global_stop = tMPI_Cycles_read();
 #endif
     tMPI_Profile_started = 0;
 }
index 3d763f76b5883f7af9be1b6c527f25b3c3da2a42..b4066df3959af7ac4b16e87eac209274b9426019 100644 (file)
@@ -107,17 +107,17 @@ struct tmpi_profile
 
 #ifdef TMPI_CYCLE_COUNT
     /* cycle counters */
-    tmpi_cycles_t mpifn_cycles[TMPIFN_Nfunctions]; /* array of cycle counters */
-    tmpi_cycles_t wait_cycles[TMPIWAIT_N];         /* the wait cycles */
+    tMPI_Cycles_t mpifn_cycles[TMPIFN_Nfunctions]; /* array of cycle counters */
+    tMPI_Cycles_t wait_cycles[TMPIWAIT_N];         /* the wait cycles */
 
-    tmpi_cycles_t global_start, global_stop;       /* timing start and stop times */
-    tmpi_cycles_t mpifn_start;                     /* individual timing start times for profiling
+    tMPI_Cycles_t global_start, global_stop;       /* timing start and stop times */
+    tMPI_Cycles_t mpifn_start;                     /* individual timing start times for profiling
                                                       function call times.  This can be here
                                                       because tmpi_profile is thread-specific. */
     enum tmpi_functions fn;                        /* the function being cycle-counted */
 
 
-    tmpi_cycles_t wait_start; /* individual timing start times for profiling
+    tMPI_Cycles_t wait_start; /* individual timing start times for profiling
                                  wait times. */
 
     double totals;            /* totals counter for reporting end results */
@@ -127,7 +127,7 @@ struct tmpi_profile
 extern int tMPI_Profile_started;
 
 /* initialize the profile counter */
-void tMPI_Profile_init(struct tmpi_profile *prof);
+int tMPI_Profile_init(struct tmpi_profile *prof);
 
 #if 0
 /* deallocations */
@@ -143,7 +143,7 @@ void tMPI_Profile_stop(struct tmpi_profile *prof);
 /* start */
 #ifdef TMPI_CYCLE_COUNT
 /*void tMPI_Profile_count_start(struct tmpi_thread *th);*/
-#define tMPI_Profile_count_start(th) { th->profile.mpifn_start = tmpi_cycles_read(); }
+#define tMPI_Profile_count_start(th) { th->profile.mpifn_start = tMPI_Cycles_read(); }
 #else
 #define tMPI_Profile_count_start(th) {}
 #endif
@@ -153,7 +153,7 @@ void tMPI_Profile_stop(struct tmpi_profile *prof);
 #ifdef TMPI_CYCLE_COUNT
 #define tMPI_Profile_count_stop(th, fn) \
     { \
-        tmpi_cycles_t stop = tmpi_cycles_read(); \
+        tMPI_Cycles_t stop = tMPI_Cycles_read(); \
         th->profile.mpifn_cycles[fn] += (stop - th->profile.mpifn_start); \
         (th->profile.mpifn_calls[fn])++; \
     }
@@ -176,7 +176,7 @@ void tMPI_Profile_stop(struct tmpi_profile *prof);
 /*void tMPI_Profile_wait_start(struct tmpi_thread *th);*/
 #define tMPI_Profile_wait_start(th) \
     { \
-        th->profile.wait_start = tmpi_cycles_read(); \
+        th->profile.wait_start = tMPI_Cycles_read(); \
     }
 
 /* stop waiting cycle count */
@@ -184,7 +184,7 @@ void tMPI_Profile_stop(struct tmpi_profile *prof);
                             enum tmpi_wait_functions fn);*/
 #define tMPI_Profile_wait_stop(th, fn) \
     { \
-        tmpi_cycles_t wait_stop = tmpi_cycles_read(); \
+        tMPI_Cycles_t wait_stop = tMPI_Cycles_read(); \
         th->profile.wait_cycles[fn] += (wait_stop - th->profile.wait_start); \
     }
 #else
index 948ae78aa72b3de39b3c18ad28579e89323df6b3..85c25c99620335747b86935fee028f08b1d9cfea 100644 (file)
@@ -94,20 +94,6 @@ static int             thread_id_key_initialized = 0;
 
 
 
-/* TODO: this needs to go away!  (there's another one in winthreads.c)
-   fatal errors are thankfully really rare*/
-void tMPI_Fatal_error(const char *file, int line, const char *message, ...)
-{
-    va_list ap;
-
-    fprintf(stderr, "tMPI Fatal error in %s, line %d: ", file, line);
-    va_start(ap, message);
-    vfprintf(stderr, message, ap);
-    va_end(ap);
-    fprintf(stderr, "\n");
-    abort();
-}
-
 
 enum tMPI_Thread_support tMPI_Thread_support(void)
 {
@@ -146,22 +132,46 @@ static void tMPI_Destroy_thread_id(void* thread_id)
 }
 
 /* initialize the thread id vars if not already initialized */
-static void tMPI_Init_thread_ids(void)
+static int tMPI_Init_thread_ids(void)
 {
-    pthread_mutex_lock( &thread_id_mutex );
+    int ret;
+    ret = pthread_mutex_lock( &thread_id_mutex );
+    if (ret != 0)
+    {
+        return ret;
+    }
+
     if (!thread_id_key_initialized)
     {
         /* initialize and set the thread id thread-specific variable */
         struct tMPI_Thread *main_thread;
 
         thread_id_key_initialized = 1;
-        pthread_key_create(&thread_id_key, tMPI_Destroy_thread_id);
-        main_thread                  = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+        ret = pthread_key_create(&thread_id_key, tMPI_Destroy_thread_id);
+        if (ret != 0)
+        {
+            goto err;
+        }
+        main_thread = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+        if (main_thread == NULL)
+        {
+            ret = ENOMEM;
+            goto err;
+        }
         main_thread->th              = pthread_self();
         main_thread->started_by_tmpi = 0;
-        pthread_setspecific(thread_id_key, main_thread);
+        ret = pthread_setspecific(thread_id_key, main_thread);
+        if (ret != 0)
+        {
+            goto err;
+        }
     }
+
+    ret = pthread_mutex_unlock( &thread_id_mutex );
+    return ret;
+err:
     pthread_mutex_unlock( &thread_id_mutex );
+    return ret;
 }
 
 /* structure to hold the arguments for the thread_starter function */
@@ -178,8 +188,13 @@ static void *tMPI_Thread_starter(void *arg)
     struct tMPI_Thread_starter *starter = (struct tMPI_Thread_starter *)arg;
     void *(*start_routine)(void*);
     void *parg;
+    int   ret;
 
-    pthread_setspecific(thread_id_key, starter->thread);
+    ret = pthread_setspecific(thread_id_key, starter->thread);
+    if (ret != 0)
+    {
+        return NULL;
+    }
     start_routine = starter->start_routine;
     parg          = starter->arg;
 
@@ -195,34 +210,31 @@ int tMPI_Thread_create(tMPI_Thread_t *thread, void *(*start_routine)(void *),
 
     if (thread == NULL)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Invalid thread pointer.");
         return EINVAL;
     }
     tMPI_Init_thread_ids();
 
-    *thread                    = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+    *thread = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+    if (*thread == NULL)
+    {
+        return ENOMEM;
+    }
     (*thread)->started_by_tmpi = 1;
     starter                    = (struct tMPI_Thread_starter*)
         malloc(sizeof(struct tMPI_Thread_starter)*1);
+    if (starter == NULL)
+    {
+        return ENOMEM;
+    }
     /* fill the starter structure */
     starter->thread        = *thread;
     starter->start_routine = start_routine;
     starter->arg           = arg;
 
-    /*ret=pthread_create(&((*thread)->th),NULL,start_routine,arg);*/
     ret = pthread_create(&((*thread)->th), NULL, tMPI_Thread_starter,
                          (void*)starter);
 
-    if (ret != 0)
-    {
-        /* Cannot use tMPI_error() since messages use threads for locking */
-        tMPI_Fatal_error(TMPI_FARGS, "Failed to create POSIX thread:%s, rc=%d",
-                         strerror(errno), ret);
-        /* Use system memory allocation routines */
-        return -1;
-    }
-
-    return 0;
+    return ret;
 }
 
 
@@ -232,23 +244,27 @@ int tMPI_Thread_join(tMPI_Thread_t thread, void **value_ptr)
     int       ret;
     pthread_t th = thread->th;
 
-
     ret = pthread_join( th, value_ptr );
-
-    free(thread);
     if (ret != 0)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Failed to join POSIX thread. rc=%d", ret);
+        return ret;
     }
-    return ret;
+    free(thread);
+    return 0;
 }
 
 
 tMPI_Thread_t tMPI_Thread_self(void)
 {
     tMPI_Thread_t th;
+    int           ret;
+
     /* make sure the key var is set */
-    tMPI_Init_thread_ids();
+    ret = tMPI_Init_thread_ids();
+    if (ret != 0)
+    {
+        return NULL;
+    }
 
     th = pthread_getspecific(thread_id_key);
 
@@ -256,9 +272,15 @@ tMPI_Thread_t tMPI_Thread_self(void)
     if (th == NULL)
     {
         /* if not, create an ID, set it and return it */
-        th                  = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
-        th->started_by_tmpi = 0;
+        th = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+        if (th == NULL)
+        {
+            return NULL;
+        }
         th->th              = pthread_self();
+        th->started_by_tmpi = 0;
+        /* we ignore errors here because they're not important -
+           the next call will do the same thing. */
         pthread_setspecific(thread_id_key, th);
     }
     return th;
@@ -323,32 +345,60 @@ int tMPI_Thread_mutex_init(tMPI_Thread_mutex_t *mtx)
         return EINVAL;
     }
 
-    mtx->mutex = (struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
-    ret        = pthread_mutex_init(&(mtx->mutex->mtx), NULL);
-
+    mtx->mutex = (struct tMPI_Mutex*)malloc(sizeof(struct tMPI_Mutex)*1);
+    if (mtx->mutex == NULL)
+    {
+        return ENOMEM;
+    }
+    ret = pthread_mutex_init(&(mtx->mutex->mtx), NULL);
     if (ret != 0)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX mutex. rc=%d");
-        /* Use system memory allocation routines */
         return ret;
     }
 
+#ifndef TMPI_NO_ATOMICS
     tMPI_Atomic_set(&(mtx->initialized), 1);
+#else
+    mtx->initialized.value = 1;
+#endif
     return 0;
 }
 
-static int tMPI_Thread_mutex_init_once(tMPI_Thread_mutex_t *mtx)
+static inline int tMPI_Thread_mutex_init_once(tMPI_Thread_mutex_t *mtx)
 {
     int ret = 0;
 
-    /* we're relying on the memory barrier semantics of mutex_lock/unlock
-       for the check preceding this function call to have worked */
-    pthread_mutex_lock( &(mutex_init) );
-    if (mtx->mutex == NULL)
+#ifndef TMPI_NO_ATOMICS
+    /* check whether the mutex is initialized */
+    if (tMPI_Atomic_get( &(mtx->initialized)  ) == 0)
+#endif
     {
-        mtx->mutex = (struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
-        ret        = pthread_mutex_init( &(mtx->mutex->mtx), NULL);
+        /* we're relying on the memory barrier semantics of mutex_lock/unlock
+           for the 'initialized' check above to have worked */
+        ret = pthread_mutex_lock( &(mutex_init) );
+        if (ret != 0)
+        {
+            return ret;
+        }
+
+        if (mtx->mutex == NULL)
+        {
+            mtx->mutex = (struct tMPI_Mutex*)malloc(sizeof(struct tMPI_Mutex));
+            if (mtx->mutex == NULL)
+            {
+                ret = ENOMEM;
+                goto err;
+            }
+            ret = pthread_mutex_init( &(mtx->mutex->mtx), NULL);
+            if (ret != 0)
+            {
+                goto err;
+            }
+        }
+        ret = pthread_mutex_unlock( &(mutex_init) );
     }
+    return ret;
+err:
     pthread_mutex_unlock( &(mutex_init) );
     return ret;
 }
@@ -364,13 +414,11 @@ int tMPI_Thread_mutex_destroy(tMPI_Thread_mutex_t *mtx)
     }
 
     ret = pthread_mutex_destroy( &(mtx->mutex->mtx) );
-    free(mtx->mutex);
-
     if (ret != 0)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Error destroying POSIX mutex. rc=%d", ret);
-        /* Use system memory allocation routines */
+        return ret;
     }
+    free(mtx->mutex);
     return ret;
 }
 
@@ -381,17 +429,13 @@ int tMPI_Thread_mutex_lock(tMPI_Thread_mutex_t *mtx)
     int ret;
 
     /* check whether the mutex is initialized */
-    if (tMPI_Atomic_get( &(mtx->initialized)  ) == 0)
+    ret = tMPI_Thread_mutex_init_once(mtx);
+    if (ret != 0)
     {
-        ret = tMPI_Thread_mutex_init_once(mtx);
-        if (ret)
-        {
-            return ret;
-        }
+        return ret;
     }
 
     ret = pthread_mutex_lock(&(mtx->mutex->mtx));
-
     return ret;
 }
 
@@ -403,17 +447,13 @@ int tMPI_Thread_mutex_trylock(tMPI_Thread_mutex_t *mtx)
     int ret;
 
     /* check whether the mutex is initialized */
-    if (tMPI_Atomic_get( &(mtx->initialized)  ) == 0)
+    ret = tMPI_Thread_mutex_init_once(mtx);
+    if (ret != 0)
     {
-        ret = tMPI_Thread_mutex_init_once(mtx);
-        if (ret)
-        {
-            return ret;
-        }
+        return ret;
     }
 
     ret = pthread_mutex_trylock(&(mtx->mutex->mtx));
-
     return ret;
 }
 
@@ -424,17 +464,13 @@ int tMPI_Thread_mutex_unlock(tMPI_Thread_mutex_t *mtx)
     int ret;
 
     /* check whether the mutex is initialized */
-    if (tMPI_Atomic_get( &(mtx->initialized)  ) == 0)
+    ret = tMPI_Thread_mutex_init_once(mtx);
+    if (ret != 0)
     {
-        ret = tMPI_Thread_mutex_init_once(mtx);
-        if (ret)
-        {
-            return ret;
-        }
+        return ret;
     }
 
     ret = pthread_mutex_unlock(&(mtx->mutex->mtx));
-
     return ret;
 }
 
@@ -446,19 +482,20 @@ int tMPI_Thread_key_create(tMPI_Thread_key_t *key, void (*destructor)(void *))
 
     if (key == NULL)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Invalid key pointer.");
         return EINVAL;
     }
 
 
-    key->key = (struct tMPI_Thread_key*)tMPI_Malloc(sizeof(struct
-                                                           tMPI_Thread_key)*1);
+    key->key = (struct tMPI_Thread_key*)malloc(sizeof(struct
+                                                      tMPI_Thread_key)*1);
+    if (key->key == NULL)
+    {
+        return ENOMEM;
+    }
     ret = pthread_key_create(&((key)->key->pkey), destructor);
     if (ret != 0)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Failed to create thread key, rc=%d.", ret);
-        fflush(stderr);
-        return -1;
+        return ret;
     }
 
     tMPI_Atomic_set(&(key->initialized), 1);
@@ -471,15 +508,13 @@ int tMPI_Thread_key_delete(tMPI_Thread_key_t key)
     int ret;
 
     ret = pthread_key_delete((key.key->pkey));
-    free(key.key);
-
     if (ret != 0)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Failed to delete thread key, rc=%d.", ret);
-        fflush(stderr);
+        return ret;
     }
+    free(key.key);
 
-    return ret;
+    return 0;
 }
 
 
@@ -514,7 +549,8 @@ int tMPI_Thread_once(tMPI_Thread_once_t *once_control,
     }
 
     /* really ugly hack - and it's slow... */
-    if ( (ret = pthread_mutex_lock( &once_init )) )
+    ret = pthread_mutex_lock( &once_init );
+    if (ret != 0)
     {
         return ret;
     }
@@ -523,9 +559,9 @@ int tMPI_Thread_once(tMPI_Thread_once_t *once_control,
         (*init_routine)();
         tMPI_Atomic_set(&(once_control->once), 1);
     }
-    pthread_mutex_unlock( &once_init );
+    ret = pthread_mutex_unlock( &once_init );
 
-    return 0;
+    return ret;
 }
 
 
@@ -540,17 +576,22 @@ int tMPI_Thread_cond_init(tMPI_Thread_cond_t *cond)
         return EINVAL;
     }
 
-    cond->condp = (struct tMPI_Thread_cond*)
-        tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
-    ret = pthread_cond_init(&(cond->condp->cond), NULL);
+    cond->condp = (struct tMPI_Thread_cond*)malloc(
+                sizeof(struct tMPI_Thread_cond));
+    if (cond->condp == NULL)
+    {
+        return ENOMEM;
+    }
 
+    ret = pthread_cond_init(&(cond->condp->cond), NULL);
     if (ret != 0)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX condition variable. rc=%d", ret);
-        fflush(stderr);
+        return ret;
     }
     tMPI_Atomic_set(&(cond->initialized), 1);
-    return ret;
+    tMPI_Atomic_memory_barrier();
+
+    return 0;
 }
 
 
@@ -560,13 +601,30 @@ static int tMPI_Thread_cond_init_once(tMPI_Thread_cond_t *cond)
 
     /* we're relying on the memory barrier semantics of mutex_lock/unlock
        for the check preceding this function call to have worked */
-    pthread_mutex_lock( &(cond_init) );
+    ret = pthread_mutex_lock( &(cond_init) );
+    if (ret != 0)
+    {
+        return ret;
+    }
     if (cond->condp == NULL)
     {
         cond->condp = (struct tMPI_Thread_cond*)
-            tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
+            malloc(sizeof(struct tMPI_Thread_cond)*1);
+        if (cond->condp == NULL)
+        {
+            ret = ENOMEM;
+            goto err;
+        }
         ret = pthread_cond_init( &(cond->condp->cond), NULL);
+        if (ret != 0)
+        {
+            goto err;
+        }
     }
+    ret = pthread_mutex_unlock( &(cond_init) );
+    return ret;
+err:
+    /* try to unlock anyway */
     pthread_mutex_unlock( &(cond_init) );
     return ret;
 }
@@ -583,16 +641,13 @@ int tMPI_Thread_cond_destroy(tMPI_Thread_cond_t *cond)
     }
 
     ret = pthread_cond_destroy(&(cond->condp->cond));
-    free(cond->condp);
-
     if (ret != 0)
     {
-        tMPI_Fatal_error(TMPI_FARGS,
-                         "Error destroying POSIX condition variable. rc=%d",
-                         ret);
-        fflush(stderr);
+        return ret;
     }
-    return ret;
+    free(cond->condp);
+
+    return 0;
 }
 
 
@@ -603,7 +658,11 @@ int tMPI_Thread_cond_wait(tMPI_Thread_cond_t *cond, tMPI_Thread_mutex_t *mtx)
     /* check whether the condition is initialized */
     if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
     {
-        tMPI_Thread_cond_init_once(cond);
+        ret = tMPI_Thread_cond_init_once(cond);
+        if (ret != 0)
+        {
+            return ret;
+        }
     }
     /* the mutex must have been initialized because it should be locked here */
 
@@ -622,7 +681,11 @@ int tMPI_Thread_cond_signal(tMPI_Thread_cond_t *cond)
     /* check whether the condition is initialized */
     if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
     {
-        tMPI_Thread_cond_init_once(cond);
+        ret = tMPI_Thread_cond_init_once(cond);
+        if (ret != 0)
+        {
+            return ret;
+        }
     }
 
     ret = pthread_cond_signal( &(cond->condp->cond) );
@@ -639,7 +702,11 @@ int tMPI_Thread_cond_broadcast(tMPI_Thread_cond_t *cond)
     /* check whether the condition is initialized */
     if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
     {
-        tMPI_Thread_cond_init_once(cond);
+        ret = tMPI_Thread_cond_init_once(cond);
+        if (ret != 0)
+        {
+            return ret;
+        }
     }
 
     ret = pthread_cond_broadcast( &(cond->condp->cond) );
@@ -675,23 +742,21 @@ int tMPI_Thread_barrier_init(tMPI_Thread_barrier_t *barrier, int n)
     }
 
     barrier->barrierp = (struct tMPI_Thread_barrier*)
-        tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
-    ret = pthread_mutex_init(&(barrier->barrierp->mutex), NULL);
+        malloc(sizeof(struct tMPI_Thread_barrier)*1);
+    if (barrier->barrierp == NULL)
+    {
+        return ENOMEM;
+    }
 
+    ret = pthread_mutex_init(&(barrier->barrierp->mutex), NULL);
     if (ret != 0)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX mutex. rc=%d",
-                         ret);
         return ret;
     }
 
     ret = pthread_cond_init(&(barrier->barrierp->cv), NULL);
-
     if (ret != 0)
     {
-        tMPI_Fatal_error(TMPI_FARGS,
-                         "Error initializing POSIX condition variable. rc=%d",
-                         ret);
         return ret;
     }
 
@@ -709,30 +774,39 @@ static int tMPI_Thread_barrier_init_once(tMPI_Thread_barrier_t *barrier)
 
     /* we're relying on the memory barrier semantics of mutex_lock/unlock
        for the check preceding this function call to have worked */
-    pthread_mutex_lock( &(barrier_init) );
+    ret = pthread_mutex_lock( &(barrier_init) );
+    if (ret != 0)
+    {
+        return ret;
+    }
+
     if (barrier->barrierp == NULL)
     {
         barrier->barrierp = (struct tMPI_Thread_barrier*)
-            tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
+            malloc(sizeof(struct tMPI_Thread_barrier)*1);
+        if (barrier->barrierp == NULL)
+        {
+            ret = ENOMEM;
+            goto err;
+        }
+
         ret = pthread_mutex_init(&(barrier->barrierp->mutex), NULL);
 
         if (ret != 0)
         {
-            tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX mutex. rc=%d",
-                             ret);
-            return ret;
+            goto err;
         }
 
         ret = pthread_cond_init(&(barrier->barrierp->cv), NULL);
 
         if (ret != 0)
         {
-            tMPI_Fatal_error(TMPI_FARGS,
-                             "Error initializing POSIX condition variable. rc=%d",
-                             ret);
-            return ret;
+            goto err;
         }
     }
+    ret = pthread_mutex_unlock( &(barrier_init) );
+    return ret;
+err:
     pthread_mutex_unlock( &(barrier_init) );
     return ret;
 }
@@ -742,13 +816,23 @@ static int tMPI_Thread_barrier_init_once(tMPI_Thread_barrier_t *barrier)
 
 int tMPI_Thread_barrier_destroy(tMPI_Thread_barrier_t *barrier)
 {
+    int ret;
+
     if (barrier == NULL)
     {
         return EINVAL;
     }
 
-    pthread_mutex_destroy(&(barrier->barrierp->mutex));
-    pthread_cond_destroy(&(barrier->barrierp->cv));
+    ret = pthread_mutex_destroy(&(barrier->barrierp->mutex));
+    if (ret != 0)
+    {
+        return ret;
+    }
+    ret = pthread_cond_destroy(&(barrier->barrierp->cv));
+    if (ret != 0)
+    {
+        return ret;
+    }
 
     free(barrier->barrierp);
 
@@ -756,10 +840,10 @@ int tMPI_Thread_barrier_destroy(tMPI_Thread_barrier_t *barrier)
 }
 
 
-int tMPI_Thread_barrier_wait(tMPI_Thread_barrier_t *   barrier)
+int tMPI_Thread_barrier_wait(tMPI_Thread_barrier_t * barrier)
 {
-    int    cycle;
-    int    rc;
+    int cycle;
+    int ret;
 
     /* check whether the barrier is initialized */
     if (tMPI_Atomic_get( &(barrier->initialized)  ) == 0)
@@ -768,12 +852,10 @@ int tMPI_Thread_barrier_wait(tMPI_Thread_barrier_t *   barrier)
     }
 
 
-    rc = pthread_mutex_lock(&barrier->barrierp->mutex);
-
-
-    if (rc != 0)
+    ret = pthread_mutex_lock(&barrier->barrierp->mutex);
+    if (ret != 0)
     {
-        return EBUSY;
+        return ret;
     }
 
     cycle = barrier->cycle;
@@ -785,28 +867,32 @@ int tMPI_Thread_barrier_wait(tMPI_Thread_barrier_t *   barrier)
     {
         barrier->cycle = !barrier->cycle;
         barrier->count = barrier->threshold;
-        r            = pthread_cond_broadcast(&barrier->barrierp->cv);
+        ret            = pthread_cond_broadcast(&barrier->barrierp->cv);
 
-        if (rc == 0)
+        if (ret != 0)
         {
-            rc = -1;
+            goto err; /* broadcast failed: unlock and report that error */
         }
     }
     else
     {
         while (cycle == barrier->cycle)
         {
-            rc = pthread_cond_wait(&barrier->barrierp->cv,
-                                   &barrier->barrierp->mutex);
-            if (rc != 0)
+            ret = pthread_cond_wait(&barrier->barrierp->cv,
+                                    &barrier->barrierp->mutex);
+            if (ret != 0)
             {
-                break;
+                goto err;
             }
         }
     }
 
+    ret = pthread_mutex_unlock(&barrier->barrierp->mutex);
+    return ret;
+err:
     pthread_mutex_unlock(&barrier->barrierp->mutex);
-    return rc;
+    return ret;
+
 }
 
 #else
index dab23f0becdfe0069b0227d48056fe3c17113ffa..807f85a3959255e769b1bd306cf59c7dbace76aa 100644 (file)
@@ -149,8 +149,7 @@ int tMPI_Scatter(void* sendbuf, int sendcount, tMPI_Datatype sendtype,
                         &(tMPI_Get_thread(comm, myrank)->cbl_multi));
             if (cev->met[myrank].cb->size < total_send_size)
             {
-                fprintf(stderr, "ERROR: cb size too small\n");
-                exit(1);
+                return tMPI_Error(comm, TMPI_ERR_COPY_BUFFER_SIZE);
             }
             /* copy to the new buf */
             memcpy(cev->met[myrank].cb->buf, sendbuf, total_send_size);
@@ -286,8 +285,7 @@ int tMPI_Scatterv(void* sendbuf, int *sendcounts, int *displs,
                         &(tMPI_Get_thread(comm, myrank)->cbl_multi));
             if (cev->met[myrank].cb->size < total_send_size)
             {
-                fprintf(stderr, "ERROR: cb size too small\n");
-                exit(1);
+                return tMPI_Error(comm, TMPI_ERR_COPY_BUFFER_SIZE);
             }
             /* copy to the new buf */
             memcpy(cev->met[myrank].cb->buf, sendbuf, total_send_size);
index e6170d497ae80e5a0c51ac541f58c37acde0d9ef..986a516b27caacee090d17b738b0b6ecfd67f7e0 100644 (file)
 
 /* whether to warn if there are mallocs at performance-critical sections
    (due to preallocations being too small) */
+#ifdef TMPI_WARNINGS
+#define TMPI_WARN_MALLOC
+#else
 /*#define TMPI_WARN_MALLOC*/
+#endif
 
 
 /* the number of envelopes to allocate per thread-to-thread path */
index 428aebe256e27483a809e2184a5cc7b9b66de2ad..88bcb84f8bad7f979341e3cf7f97e03376b8a8b5 100644 (file)
 #include <cstring>
 #include <cstdlib>
 #include <stdexcept>
+#include <string>
 #include "thread_mpi/system_error.h"
 
 tMPI::system_error::system_error(error_code ec)
-    : runtime_error(std::strerror(ec)), ec_(ec)
+    : runtime_error(std::string(std::strerror(ec))), ec_(ec)
 {
 }
 
index 4fa6169513aa9fdd0f078c6aab38ccdda6a1c932..4d15b931de4c32bc5aa86bc670e51c67fc2d8846 100644 (file)
@@ -98,18 +98,18 @@ struct tmpi_global *tmpi_global = NULL;
 
 
 /* start N threads with argc, argv (used by tMPI_Init)*/
-void tMPI_Start_threads(tmpi_bool main_returns, int N,
-                        tMPI_Affinity_strategy aff_strategy,
-                        int *argc, char ***argv,
-                        void (*start_fn)(void*), void *start_arg,
-                        int (*start_fn_main)(int, char**));
+int tMPI_Start_threads(tmpi_bool main_returns, int N,
+                       tMPI_Affinity_strategy aff_strategy,
+                       int *argc, char ***argv,
+                       void (*start_fn)(void*), void *start_arg,
+                       int (*start_fn_main)(int, char**));
 
 /* starter function for threads; takes a void pointer to a
       struct tmpi_starter_, which calls main() if tmpi_start_.fn == NULL */
 static void* tMPI_Thread_starter(void *arg);
 
 /* allocate and initialize the data associated with a thread structure */
-static void tMPI_Thread_init(struct tmpi_thread *th);
+static int tMPI_Thread_init(struct tmpi_thread *th);
 /* deallocate the data associated with a thread structure */
 static void tMPI_Thread_destroy(struct tmpi_thread *th);
 
@@ -123,6 +123,7 @@ void tMPI_Trace_print(const char *fmt, ...)
     struct tmpi_thread       * th  = NULL;
     static tMPI_Thread_mutex_t mtx = TMPI_THREAD_MUTEX_INITIALIZER;
 
+    /* don't check for errors during trace */
     tMPI_Thread_mutex_lock(&mtx);
     if (threads)
     {
@@ -143,41 +144,6 @@ void tMPI_Trace_print(const char *fmt, ...)
 #endif
 
 
-#if 0
-struct tmpi_thread *tMPI_Get_current(void)
-{
-    if (!threads)
-    {
-        return NULL;
-    }
-
-    return (struct tmpi_thread*)tMPI_thread_getspecific(id_key);
-}
-
-
-unsigned int tMPI_Threadnr(struct tmpi_thread *thr)
-{
-    return thr-threads;
-}
-#endif
-#if 0
-unsigned int tMPI_This_threadnr(void)
-{
-    return tMPI_Get_current()-threads;
-}
-
-struct tmpi_thread *tMPI_Get_thread(tMPI_Comm comm, int rank)
-{
-    /* check destination */
-    if ( (rank < 0) || (rank > comm->grp.N) )
-    {
-        tMPI_Error(comm, TMPI_ERR_GROUP_RANK);
-        return NULL;
-    }
-    return comm->grp.peers[rank];
-}
-#endif
-
 tmpi_bool tMPI_Is_master(void)
 {
     /* if there are no other threads, we're the main thread */
@@ -243,30 +209,55 @@ int tMPI_Get_N(int *argc, char ***argv, const char *optname, int *nthreads)
     return ret;
 }
 
-static void tMPI_Thread_init(struct tmpi_thread *th)
+static int tMPI_Thread_init(struct tmpi_thread *th)
 {
+    int ret;
     int N_envelopes      = (Nthreads+1)*N_EV_ALLOC;
     int N_send_envelopes = N_EV_ALLOC;
     int N_reqs           = (Nthreads+1)*N_EV_ALLOC;
     int i;
 
     /* we set our thread id, as a thread-specific piece of global data. */
-    tMPI_Thread_setspecific(id_key, th);
+    ret = tMPI_Thread_setspecific(id_key, th);
+    if (ret != 0)
+    {
+        return ret;
+    }
 
     /* allocate comm.self */
-    th->self_comm               = tMPI_Comm_alloc(TMPI_COMM_WORLD, 1);
+    ret = tMPI_Comm_alloc( &(th->self_comm), TMPI_COMM_WORLD, 1);
+    if (ret != TMPI_SUCCESS)
+    {
+        return ret;
+    }
     th->self_comm->grp.peers[0] = th;
 
     /* allocate envelopes */
-    tMPI_Free_env_list_init( &(th->envelopes), N_envelopes );
+    ret = tMPI_Free_env_list_init( &(th->envelopes), N_envelopes );
+    if (ret != TMPI_SUCCESS)
+    {
+        return ret;
+    }
     /* recv list */
-    tMPI_Recv_env_list_init( &(th->evr));
+    ret = tMPI_Recv_env_list_init( &(th->evr));
+    if (ret != TMPI_SUCCESS)
+    {
+        return ret;
+    }
     /* send lists */
     th->evs = (struct send_envelope_list*)tMPI_Malloc(
                 sizeof(struct send_envelope_list)*Nthreads);
+    if (th->evs == NULL)
+    {
+        return TMPI_ERR_NO_MEM;
+    }
     for (i = 0; i < Nthreads; i++)
     {
-        tMPI_Send_env_list_init( &(th->evs[i]), N_send_envelopes);
+        ret = tMPI_Send_env_list_init( &(th->evs[i]), N_send_envelopes);
+        if (ret != TMPI_SUCCESS)
+        {
+            return ret;
+        }
     }
 
     tMPI_Atomic_set( &(th->ev_outgoing_received), 0);
@@ -274,20 +265,39 @@ static void tMPI_Thread_init(struct tmpi_thread *th)
     tMPI_Event_init( &(th->p2p_event) );
 
     /* allocate requests */
-    tMPI_Req_list_init(&(th->rql), N_reqs);
+    ret = tMPI_Req_list_init(&(th->rql), N_reqs);
+    if (ret != TMPI_SUCCESS)
+    {
+        return ret;
+    }
+
 
 #ifdef USE_COLLECTIVE_COPY_BUFFER
     /* allcate copy_buffer list */
-    tMPI_Copy_buffer_list_init(&(th->cbl_multi), (Nthreads+1)*(N_COLL_ENV+1),
-                               Nthreads*COPY_BUFFER_SIZE);
+    ret = tMPI_Copy_buffer_list_init(&(th->cbl_multi),
+                                     (Nthreads+1)*(N_COLL_ENV+1),
+                                     Nthreads*COPY_BUFFER_SIZE);
+    if (ret != TMPI_SUCCESS)
+    {
+        return ret;
+    }
 #endif
 
 #ifdef TMPI_PROFILE
-    tMPI_Profile_init(&(th->profile));
+    ret = tMPI_Profile_init(&(th->profile));
+    if (ret != TMPI_SUCCESS)
+    {
+        return ret;
+    }
 #endif
     /* now wait for all other threads to come on line, before we
        start the MPI program */
-    tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+    ret = tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+    if (ret != 0)
+    {
+        return ret;
+    }
+    return ret;
 }
 
 
@@ -315,17 +325,32 @@ static void tMPI_Thread_destroy(struct tmpi_thread *th)
     }
 }
 
-static void tMPI_Global_init(struct tmpi_global *g, int Nthreads)
+static int tMPI_Global_init(struct tmpi_global *g, int Nthreads)
 {
+    int ret;
+
     g->usertypes        = NULL;
     g->N_usertypes      = 0;
     g->Nalloc_usertypes = 0;
-    tMPI_Thread_mutex_init(&(g->timer_mutex));
+    ret                 = tMPI_Thread_mutex_init(&(g->timer_mutex));
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
     tMPI_Spinlock_init(&(g->datatype_lock));
 
-    tMPI_Thread_barrier_init( &(g->barrier), Nthreads);
+    ret = tMPI_Thread_barrier_init( &(g->barrier), Nthreads);
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
+
+    ret = tMPI_Thread_mutex_init(&(g->comm_link_lock));
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
 
-    tMPI_Thread_mutex_init(&(g->comm_link_lock));
 
 #if !(defined( _WIN32 ) || defined( _WIN64 ) )
     /* the time at initialization. */
@@ -334,7 +359,7 @@ static void tMPI_Global_init(struct tmpi_global *g, int Nthreads)
     /* the time at initialization. */
     g->timer_init = GetTickCount();
 #endif
-
+    return TMPI_SUCCESS;
 }
 
 static void tMPI_Global_destroy(struct tmpi_global *g)
@@ -349,13 +374,18 @@ static void tMPI_Global_destroy(struct tmpi_global *g)
 
 static void* tMPI_Thread_starter(void *arg)
 {
+    int                 ret;
     struct tmpi_thread *th = (struct tmpi_thread*)arg;
 
 #ifdef TMPI_TRACE
     tMPI_Trace_print("Created thread nr. %d", (int)(th-threads));
 #endif
 
-    tMPI_Thread_init(th);
+    ret = tMPI_Thread_init(th);
+    if (ret != TMPI_SUCCESS)
+    {
+        return NULL;
+    }
 
     /* start_fn, start_arg, argc and argv were set by the calling function */
     if (!th->start_fn)
@@ -371,16 +401,17 @@ static void* tMPI_Thread_starter(void *arg)
         }
     }
 
-    return 0;
+    return NULL;
 }
 
 
-void tMPI_Start_threads(tmpi_bool main_returns, int N,
-                        tMPI_Affinity_strategy aff_strategy,
-                        int *argc, char ***argv,
-                        void (*start_fn)(void*), void *start_arg,
-                        int (*start_fn_main)(int, char**))
+int tMPI_Start_threads(tmpi_bool main_returns, int N,
+                       tMPI_Affinity_strategy aff_strategy,
+                       int *argc, char ***argv,
+                       void (*start_fn)(void*), void *start_arg,
+                       int (*start_fn_main)(int, char**))
 {
+    int ret;
 #ifdef TMPI_TRACE
     tMPI_Trace_print("tMPI_Start_threads(%d, %d, %d, %d, %d, %p, %p, %p, %p)",
                      main_returns, N, aff_strategy, argc, argv, start_fn,
@@ -397,16 +428,33 @@ void tMPI_Start_threads(tmpi_bool main_returns, int N,
         /* allocate global data */
         tmpi_global = (struct tmpi_global*)
             tMPI_Malloc(sizeof(struct tmpi_global));
-        tMPI_Global_init(tmpi_global, N);
+        if (tmpi_global == 0)
+        {
+            return TMPI_ERR_NO_MEM;
+        }
+        ret = tMPI_Global_init(tmpi_global, N);
+        if (ret != TMPI_SUCCESS)
+        {
+            return ret;
+        }
 
         /* allocate world and thread data */
-        threads          = (struct tmpi_thread*)tMPI_Malloc(sizeof(struct tmpi_thread)*N);
-        TMPI_COMM_WORLD  = tMPI_Comm_alloc(NULL, N);
+        threads = (struct tmpi_thread*)
+            tMPI_Malloc(sizeof(struct tmpi_thread)*N);
+        if (threads == NULL)
+        {
+            return TMPI_ERR_NO_MEM;
+        }
+        ret = tMPI_Comm_alloc(&TMPI_COMM_WORLD, NULL, N);
+        if (ret != TMPI_SUCCESS)
+        {
+            return ret;
+        }
         TMPI_GROUP_EMPTY = tMPI_Group_alloc();
 
         if (tMPI_Thread_key_create(&id_key, NULL))
         {
-            tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
+            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
         }
         for (i = 0; i < N; i++)
         {
@@ -458,7 +506,6 @@ void tMPI_Start_threads(tmpi_bool main_returns, int N,
 
         for (i = 1; i < N; i++) /* zero is the main thread */
         {
-            int ret;
             ret = tMPI_Thread_create(&(threads[i].thread_id),
                                      tMPI_Thread_starter,
                                      (void*)&(threads[i]) );
@@ -467,9 +514,9 @@ void tMPI_Start_threads(tmpi_bool main_returns, int N,
             {
                 tMPI_Thread_setaffinity_single(threads[i].thread_id, i);
             }
-            if (ret)
+            if (ret != TMPI_SUCCESS)
             {
-                tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
+                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
             }
         }
         /* the main thread also runs start_fn if we don't want
@@ -477,18 +524,25 @@ void tMPI_Start_threads(tmpi_bool main_returns, int N,
         if (!main_returns)
         {
             tMPI_Thread_starter((void*)&(threads[0]));
+
         }
         else
         {
-            tMPI_Thread_init(&(threads[0]));
+            ret = tMPI_Thread_init(&(threads[0]));
+            if (ret != 0)
+            {
+                return ret;
+            }
         }
     }
+    return TMPI_SUCCESS;
 }
 
 
 int tMPI_Init(int *argc, char ***argv,
               int (*start_function)(int, char**))
 {
+    int ret;
 #ifdef TMPI_TRACE
     tMPI_Trace_print("tMPI_Init(%p, %p, %p)", argc, argv, start_function);
 #endif
@@ -497,8 +551,12 @@ int tMPI_Init(int *argc, char ***argv,
     {
         int N = 0;
         tMPI_Get_N(argc, argv, "-nt", &N);
-        tMPI_Start_threads(TRUE, N, TMPI_AFFINITY_ALL_CORES, argc, argv,
-                           NULL, NULL, start_function);
+        ret = tMPI_Start_threads(TRUE, N, TMPI_AFFINITY_ALL_CORES, argc, argv,
+                                 NULL, NULL, start_function);
+        if (ret != 0)
+        {
+            return ret;
+        }
     }
     else
     {
@@ -516,6 +574,7 @@ int tMPI_Init_fn(int main_thread_returns, int N,
                  tMPI_Affinity_strategy aff_strategy,
                  void (*start_function)(void*), void *arg)
 {
+    int ret;
 #ifdef TMPI_TRACE
     tMPI_Trace_print("tMPI_Init_fn(%d, %p, %p)", N, start_function, arg);
 #endif
@@ -531,8 +590,12 @@ int tMPI_Init_fn(int main_thread_returns, int N,
 
     if (TMPI_COMM_WORLD == 0 && N >= 1) /* we're the main process */
     {
-        tMPI_Start_threads(main_thread_returns, N, aff_strategy,
-                           0, 0, start_function, arg, NULL);
+        ret = tMPI_Start_threads(main_thread_returns, N, aff_strategy,
+                                 0, 0, start_function, arg, NULL);
+        if (ret != 0)
+        {
+            return ret;
+        }
     }
     return TMPI_SUCCESS;
 }
@@ -551,6 +614,7 @@ int tMPI_Initialized(int *flag)
 int tMPI_Finalize(void)
 {
     int i;
+    int ret;
 #ifdef TMPI_TRACE
     tMPI_Trace_print("tMPI_Finalize()");
 #endif
@@ -564,7 +628,11 @@ int tMPI_Finalize(void)
         struct tmpi_thread *cur = tMPI_Get_current();
 
         tMPI_Profile_stop( &(cur->profile) );
-        tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+        ret = tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+        if (ret != 0)
+        {
+            return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+        }
 
         if (tMPI_Is_master())
         {
@@ -572,7 +640,13 @@ int tMPI_Finalize(void)
         }
     }
 #endif
-    tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+    ret = tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+    if (ret != 0)
+    {
+        return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+    }
+
+
 
     if (tMPI_Is_master())
     {
@@ -583,7 +657,7 @@ int tMPI_Finalize(void)
         {
             if (tMPI_Thread_join(threads[i].thread_id, NULL))
             {
-                tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_FINALIZE);
+                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_FINALIZE);
             }
             tMPI_Thread_destroy(&(threads[i]));
         }
@@ -597,16 +671,35 @@ int tMPI_Finalize(void)
         {
             tMPI_Comm cur;
 
-            tMPI_Thread_mutex_lock(&(tmpi_global->comm_link_lock));
+            ret = tMPI_Thread_mutex_lock(&(tmpi_global->comm_link_lock));
+            if (ret != 0)
+            {
+                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+            }
             cur = TMPI_COMM_WORLD->next;
             while (cur && (cur != TMPI_COMM_WORLD) )
             {
                 tMPI_Comm next = cur->next;
-                tMPI_Comm_destroy(cur, FALSE);
+                ret = tMPI_Comm_destroy(cur, FALSE);
+                if (ret != 0)
+                {
+                    tMPI_Thread_mutex_unlock(&(tmpi_global->comm_link_lock));
+                    return ret;
+                }
                 cur = next;
             }
-            tMPI_Comm_destroy(TMPI_COMM_WORLD, FALSE);
-            tMPI_Thread_mutex_unlock(&(tmpi_global->comm_link_lock));
+            ret = tMPI_Comm_destroy(TMPI_COMM_WORLD, FALSE);
+            if (ret != 0)
+            {
+                tMPI_Thread_mutex_unlock(&(tmpi_global->comm_link_lock));
+                return ret;
+            }
+            ret = tMPI_Thread_mutex_unlock(&(tmpi_global->comm_link_lock));
+            if (ret != 0)
+            {
+                return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+            }
+
         }
 
         tMPI_Group_free(&TMPI_GROUP_EMPTY);
@@ -670,7 +763,8 @@ int tMPI_Abort(tMPI_Comm comm, int errorcode)
         }
         else
         {
-            fprintf(stderr, "tMPI_Abort called on main thread with errorcode=%d\n",
+            fprintf(stderr,
+                    "tMPI_Abort called on main thread with errorcode=%d\n",
                     errorcode);
         }
         fflush(stderr);
@@ -805,12 +899,6 @@ double tMPI_Wtick(void)
 #endif
 }
 
-
-
-
-
-
-
 int tMPI_Get_count(tMPI_Status *status, tMPI_Datatype datatype, int *count)
 {
 #ifdef TMPI_TRACE
index cc1a2c2c21a35bba2ab6724d772a8d3f089806c1..df105f0a31b9e6f983df33d6654da9a91f4cf590 100644 (file)
@@ -66,7 +66,7 @@ void *tMPI_Malloc(size_t size)
 
     if (!ret)
     {
-        tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_MALLOC);
+        tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_NO_MEM);
     }
     return ret;
 }
@@ -76,7 +76,7 @@ void *tMPI_Realloc(void *p, size_t size)
     void *ret = (void*)realloc(p, size);
     if (!ret)
     {
-        tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_MALLOC);
+        tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_NO_MEM);
     }
     return ret;
 }
index c944a434444c23ff01f8e2545b5546ba3eb179b6..f78dd73553c3a340f69490c0a00f4a012234d527 100644 (file)
@@ -357,11 +357,10 @@ int tMPI_Init_NUMA(void)
 
     /* allocate array of processor info blocks */
 
-    pMPI_ProcessorInfo = tMPI_Malloc( sizeof(MPI_NUMA_PROCESSOR_INFO) *
-                                      dwTotalProcessors );
+    pMPI_ProcessorInfo = malloc( sizeof(MPI_NUMA_PROCESSOR_INFO) *
+                                 dwTotalProcessors );
     if (pMPI_ProcessorInfo == NULL)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "tMPI_Malloc failed for processor information");
         goto cleanup;
     }
 
@@ -407,17 +406,11 @@ int tMPI_Init_NUMA(void)
 
             if (!func_GetNumaProcessorNodeEx(pProcessorNumber, pNodeNumber))
             {
-                tMPI_Fatal_error(TMPI_FARGS,
-                                 "Processor enumeration, GetNumaProcessorNodeEx failed, error code=%d",
-                                 GetLastError());
                 goto cleanup;
             }
 
             if (!func_GetNumaNodeProcessorMaskEx(*pNodeNumber, pGroupAffinity))
             {
-                tMPI_Fatal_error(TMPI_FARGS,
-                                 "Processor enumeration, GetNumaNodeProcessorMaskEx failed, error code=%d",
-                                 GetLastError());
                 goto cleanup;
             }
 
@@ -431,7 +424,6 @@ int tMPI_Init_NUMA(void)
 
             if (i > dwTotalProcessors)
             {
-                tMPI_Fatal_error(TMPI_FARGS, "Processor enumeration exceeds allocated memory!");
                 goto cleanup;
             }
         }
@@ -459,23 +451,29 @@ cleanup:
     return 0;
 }
 
-static void tMPI_Thread_id_list_init(void)
+static int tMPI_Thread_id_list_init(void)
 {
+    int ret = 0;
+
     EnterCriticalSection( &thread_id_list_lock );
 
     N_thread_id_list      = 0;
     Nalloc_thread_id_list = 4; /* number of initial allocation*/
-    thread_id_list        = (thread_id_list_t*)tMPI_Malloc(
-                sizeof(thread_id_list_t)*
-                Nalloc_thread_id_list);
+    thread_id_list        = (thread_id_list_t*)malloc(sizeof(thread_id_list_t)*
+                                                      Nalloc_thread_id_list);
+    if (thread_id_list == NULL)
+    {
+        ret = ENOMEM;
+    }
 
     LeaveCriticalSection( &thread_id_list_lock );
+    return ret;
 }
 
 
 /* add an entry to the thread ID list, assuming it's locked */
-static void tMPI_Thread_id_list_add_locked(DWORD               thread_id,
-                                           struct tMPI_Thread *th)
+static int tMPI_Thread_id_list_add_locked(DWORD               thread_id,
+                                          struct tMPI_Thread *th)
 {
     if (Nalloc_thread_id_list < N_thread_id_list + 1)
     {
@@ -484,9 +482,13 @@ static void tMPI_Thread_id_list_add_locked(DWORD               thread_id,
 
         /* double the size */
         Nalloc_thread_id_list *= 2;
-        new_list               = (thread_id_list_t*)tMPI_Malloc(
-                    sizeof(thread_id_list_t)*
-                    Nalloc_thread_id_list);
+        /* and allocate the new list */
+        new_list = (thread_id_list_t*)malloc(sizeof(thread_id_list_t)*
+                                             Nalloc_thread_id_list);
+        if (new_list == NULL)
+        {
+            return ENOMEM;
+        }
         /* and copy over all elements */
         for (i = 0; i < N_thread_id_list; i++)
         {
@@ -500,19 +502,22 @@ static void tMPI_Thread_id_list_add_locked(DWORD               thread_id,
     thread_id_list[ N_thread_id_list ].th        = th;
     N_thread_id_list++;
 
-
+    return 0;
 }
 
 
 /* add an entry to the thread ID list */
-static void tMPI_Thread_id_list_add(DWORD thread_id, struct tMPI_Thread *th)
+static int tMPI_Thread_id_list_add(DWORD thread_id, struct tMPI_Thread *th)
 {
+    int ret = 0;
     EnterCriticalSection( &thread_id_list_lock );
-    tMPI_Thread_id_list_add_locked(thread_id, th);
+    ret = tMPI_Thread_id_list_add_locked(thread_id, th);
     LeaveCriticalSection( &thread_id_list_lock );
+    return ret;
 }
 
-/* Remove an entry from the thread_id list, assuming it's locked */
+/* Remove an entry from the thread_id list, assuming it's locked.
+   Does nothing if an entry is not found.*/
 static void tMPI_Thread_id_list_remove_locked(DWORD thread_id)
 {
     int       i;
@@ -575,17 +580,17 @@ static struct tMPI_Thread *tMPI_Thread_id_list_find(DWORD thread_id)
 
     EnterCriticalSection( &thread_id_list_lock );
     ret = tMPI_Thread_id_list_find_locked(thread_id);
-
     LeaveCriticalSection( &thread_id_list_lock );
     return ret;
 }
 
 /* try to add the running thread to the list. Returns the tMPI_Thrread struct
-   associated with this thread.*/
+   associated with this thread, or NULL in case of an error.*/
 static struct tMPI_Thread *tMPI_Thread_id_list_add_self(void)
 {
     DWORD               thread_id;
     struct tMPI_Thread *th = NULL;
+    int                 ret;
 
     EnterCriticalSection( &thread_id_list_lock );
 
@@ -594,7 +599,7 @@ static struct tMPI_Thread *tMPI_Thread_id_list_add_self(void)
     if (th == NULL)
     {
         /* if not, create an ID, set it and return it */
-        th = (struct tMPI_Thread*)tMPI_Malloc(sizeof(struct tMPI_Thread)*1);
+        th = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
 
         /* to create a handle that can be used outside of the current
            thread, the handle from GetCurrentThread() must first
@@ -609,17 +614,23 @@ static struct tMPI_Thread *tMPI_Thread_id_list_add_self(void)
 
         /* This causes a small memory leak that is hard to fix. */
         th->started_by_tmpi = 0;
-        tMPI_Thread_id_list_add_locked(thread_id, th);
+        ret                 = tMPI_Thread_id_list_add_locked(thread_id, th);
+        if (ret != 0)
+        {
+            free(th);
+            th = NULL;
+        }
     }
     LeaveCriticalSection( &thread_id_list_lock );
-
     return th;
 }
 
 
-static void tMPI_Init_initers(void)
+static int tMPI_Init_initers(void)
 {
     int state;
+    int ret = 0;
+
     /* we can pre-check because it's atomic */
     if (tMPI_Atomic_get(&init_inited) == 0)
     {
@@ -636,11 +647,18 @@ static void tMPI_Init_initers(void)
             InitializeCriticalSection(&barrier_init);
             InitializeCriticalSection(&thread_id_list_lock);
 
-            /* fatal errors are handled by the routine by calling
-               tMPI_Fatal_error() */
-            tMPI_Init_NUMA();
+            ret = tMPI_Init_NUMA();
+            if (ret != 0)
+            {
+                goto err;
+            }
+
 
-            tMPI_Thread_id_list_init();
+            ret = tMPI_Thread_id_list_init();
+            if (ret != 0)
+            {
+                goto err;
+            }
 
             tMPI_Atomic_memory_barrier_rel();
             tMPI_Atomic_set(&init_inited, 1);
@@ -648,22 +666,10 @@ static void tMPI_Init_initers(void)
 
         tMPI_Spinlock_unlock( &init_init );
     }
-}
-
-
-
-/* TODO: this needs to go away!  (there's another one in pthreads.c)
-   fatal errors are thankfully really rare*/
-void tMPI_Fatal_error(const char *file, int line, const char *message, ...)
-{
-    va_list ap;
-
-    fprintf(stderr, "tMPI Fatal error in %s, line %d: ", file, line);
-    va_start(ap, message);
-    vfprintf(stderr, message, ap);
-    va_end(ap);
-    fprintf(stderr, "\n");
-    abort();
+    return ret;
+err:
+    tMPI_Spinlock_unlock( &init_init );
+    return ret;
 }
 
 
@@ -709,23 +715,37 @@ int tMPI_Thread_create(tMPI_Thread_t *thread,
 {
     DWORD thread_id;
     struct tMPI_Thread_starter_param *prm;
+    int   ret;
+
+    ret = tMPI_Init_initers();
+    if (ret != 0)
+    {
+        return ret;
+    }
 
-    tMPI_Init_initers();
+    if (thread == NULL)
+    {
+        return EINVAL;
+    }
 
     /* a small memory leak to be sure that it doesn't get deallocated
        once this function ends, before the newly created thread uses it. */
     prm = (struct tMPI_Thread_starter_param*)
-        tMPI_Malloc(sizeof(struct tMPI_Thread_starter_param));
+        malloc(sizeof(struct tMPI_Thread_starter_param));
+    if (prm == NULL)
+    {
+        return ENOMEM;
+    }
     prm->start_routine = start_routine;
     prm->param         = arg;
 
-    *thread = (struct tMPI_Thread*)tMPI_Malloc(sizeof(struct tMPI_Thread)*1);
-
-    if (thread == NULL)
+    *thread = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+    if (*thread == NULL)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Invalid thread pointer.");
-        return EINVAL;
+        free(prm);
+        return ENOMEM;
     }
+
     /* this must be locked before the thread is created to prevent a race
        condition if the thread immediately wants to create its own entry */
     EnterCriticalSection( &thread_id_list_lock );
@@ -737,25 +757,39 @@ int tMPI_Thread_create(tMPI_Thread_t *thread,
                                               prm,
                                               0,
                                               &thread_id);
+    if ((*thread)->th == NULL)
+    {
+        ret = -1;
+        goto err;
+    }
     (*thread)->id = thread_id;
 
     if ((*thread)->th == NULL)
     {
-        tMPI_Free(thread);
-        tMPI_Fatal_error(TMPI_FARGS, "Failed to create thread, error code=%d",
-                         GetLastError());
-        return -1;
+        ret = -1;
+        goto err;
+    }
+    ret = tMPI_Thread_id_list_add_locked(thread_id, (*thread));
+    if (ret != 0)
+    {
+        goto err;
     }
-    tMPI_Thread_id_list_add_locked(thread_id, (*thread));
     LeaveCriticalSection( &thread_id_list_lock );
 
+#if 0
     /* inherit the thread priority from the parent thread. */
     /* TODO: is there value in setting this, vs. just allowing it to default
        from the process?  currently, this limits the effectivenes of changing
        the priority in eg: TaskManager. */
     SetThreadPriority(((*thread)->th), GetThreadPriority(GetCurrentThread()));
+#endif
 
     return 0;
+err:
+    free(prm);
+    free(*thread); /* the struct allocated above, not the caller's handle */
+    LeaveCriticalSection( &thread_id_list_lock );
+    return ret;
 }
 
 
@@ -769,11 +803,8 @@ int tMPI_Thread_join(tMPI_Thread_t thread, void **value_ptr)
     DWORD ret, retval;
 
     ret = WaitForSingleObject(thread->th, INFINITE);
-
     if (ret != 0)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Failed to join thread. error code=%d",
-                         GetLastError());
         return -1;
     }
 
@@ -781,16 +812,12 @@ int tMPI_Thread_join(tMPI_Thread_t thread, void **value_ptr)
     {
         if (!GetExitCodeThread(thread, &retval))
         {
-            /* TODO: somehow assign value_ptr */
-            tMPI_Fatal_error(TMPI_FARGS,
-                             "Failed to get thread exit code: error=%d",
-                             GetLastError());
             return -1;
         }
     }
     CloseHandle(thread->th);
     tMPI_Thread_id_list_remove(thread->id);
-    tMPI_Free(thread);
+    free(thread);
 
     return 0;
 }
@@ -798,7 +825,6 @@ int tMPI_Thread_join(tMPI_Thread_t thread, void **value_ptr)
 
 void tMPI_Thread_exit(void *value_ptr)
 {
-    /* TODO: fix exit code */
     /* TODO: call destructors for thread-local storage */
     ExitThread( 0 );
 }
@@ -810,8 +836,6 @@ int tMPI_Thread_cancel(tMPI_Thread_t thread)
 {
     if (!TerminateThread( thread, -1) )
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Failed thread_cancel, error code=%d",
-                         GetLastError());
         return -1;
     }
     tMPI_Thread_id_list_remove(thread->id);
@@ -822,10 +846,15 @@ int tMPI_Thread_cancel(tMPI_Thread_t thread)
 tMPI_Thread_t tMPI_Thread_self(void)
 {
     tMPI_Thread_t th;
-    tMPI_Init_initers();
+    int           ret;
 
-    th = tMPI_Thread_id_list_add_self();
+    ret = tMPI_Init_initers();
+    if (ret != 0)
+    {
+        return NULL;
+    }
 
+    th = tMPI_Thread_id_list_add_self();
     return th;
 }
 
@@ -914,7 +943,11 @@ int tMPI_Thread_mutex_init(tMPI_Thread_mutex_t *mtx)
         return EINVAL;
     }
 
-    mtx->mutex = (struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
+    mtx->mutex = (struct tMPI_Mutex*)malloc(sizeof(struct tMPI_Mutex)*1);
+    if (mtx->mutex == NULL)
+    {
+        return ENOMEM;
+    }
     InitializeCriticalSection(&(mtx->mutex->cs));
 
     return 0;
@@ -929,7 +962,7 @@ int tMPI_Thread_mutex_destroy(tMPI_Thread_mutex_t *mtx)
     }
 
     DeleteCriticalSection(&(mtx->mutex->cs));
-    tMPI_Free(mtx->mutex);
+    free(mtx->mutex);
 
     return 0;
 }
@@ -949,7 +982,11 @@ static int tMPI_Thread_mutex_init_once(tMPI_Thread_mutex_t *mtx)
      */
 
     /* initialize the initializers */
-    tMPI_Init_initers();
+    ret = tMPI_Init_initers();
+    if (ret != 0)
+    {
+        return ret;
+    }
     /* Lock the common one-time init mutex so we can check carefully */
     EnterCriticalSection( &mutex_init );
 
@@ -1017,22 +1054,21 @@ int tMPI_Thread_key_create(tMPI_Thread_key_t *key, void (*destructor)(void *))
 {
     if (key == NULL)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Invalid key pointer.");
         return EINVAL;
     }
 
 
     /* TODO: make list of destructors for thread-local storage */
-    key->key = (struct tMPI_Thread_key*)tMPI_Malloc(sizeof(struct
-                                                           tMPI_Thread_key)*1);
+    key->key = (struct tMPI_Thread_key*)malloc(sizeof(struct tMPI_Thread_key));
+    if (key->key == NULL)
+    {
+        return ENOMEM;
+    }
 
     (key)->key->wkey = TlsAlloc();
 
     if ( (key)->key->wkey == TLS_OUT_OF_INDEXES)
     {
-        tMPI_Fatal_error(TMPI_FARGS,
-                         "Failed to create thread key, error code=%d.",
-                         GetLastError());
         return -1;
     }
 
@@ -1043,7 +1079,7 @@ int tMPI_Thread_key_create(tMPI_Thread_key_t *key, void (*destructor)(void *))
 int tMPI_Thread_key_delete(tMPI_Thread_key_t key)
 {
     TlsFree(key.key->wkey);
-    tMPI_Free(key.key);
+    free(key.key);
 
     return 0;
 }
@@ -1098,12 +1134,18 @@ int tMPI_Thread_once(tMPI_Thread_once_t *once_control,
 
     if (!bStatus)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Failed to run thread_once routine");
         return -1;
     }
 #else
+    int ret;
+
     /* really ugly hack - and it's slow... */
-    tMPI_Init_initers();
+    ret = tMPI_Init_initers();
+    if (ret != 0)
+    {
+        return ret;
+    }
+
     EnterCriticalSection(&once_init);
     if (tMPI_Atomic_get(&(once_control->once)) == 0)
     {
@@ -1127,7 +1169,11 @@ int tMPI_Thread_cond_init(tMPI_Thread_cond_t *cond)
     }
 
     cond->condp = (struct tMPI_Thread_cond*)
-        tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
+        malloc(sizeof(struct tMPI_Thread_cond));
+    if (cond->condp == NULL)
+    {
+        return ENOMEM;
+    }
 #if 0
     /* use this code once Vista is the minimum version required */
     InitializeConditionVariable( &(cond->cv) );
@@ -1150,7 +1196,7 @@ int tMPI_Thread_cond_destroy(tMPI_Thread_cond_t *cond)
     /* windows doesnt have this function */
 #else
     DeleteCriticalSection(&(cond->condp->wtr_lock));
-    tMPI_Free(cond->condp);
+    free(cond->condp);
 #endif
     return 0;
 }
@@ -1180,7 +1226,11 @@ static int tMPI_Thread_cond_init_once(tMPI_Thread_cond_t *cond)
      */
 
     /* initialize the initializers */
-    tMPI_Init_initers();
+    ret = tMPI_Init_initers();
+    if (ret != 0)
+    {
+        return ret;
+    }
     /* Lock the common one-time init mutex so we can check carefully */
     EnterCriticalSection( &cond_init );
 
@@ -1204,11 +1254,16 @@ int tMPI_Thread_cond_wait(tMPI_Thread_cond_t *cond, tMPI_Thread_mutex_t *mtx)
     BOOL wait_done   = FALSE;
     BOOL last_waiter = FALSE;
     int  my_cycle;
+    int  ret;
 
     /* check whether the condition is initialized */
     if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
     {
-        tMPI_Thread_cond_init_once(cond);
+        ret = tMPI_Thread_cond_init_once(cond);
+        if (ret != 0)
+        {
+            return ret;
+        }
     }
     /* the mutex must have been initialized because it should be locked here */
 
@@ -1218,8 +1273,6 @@ int tMPI_Thread_cond_wait(tMPI_Thread_cond_t *cond, tMPI_Thread_mutex_t *mtx)
 
     if (!ret)
     {
-        tMPI_Fatal_error(TMPI_FARGS, "Failed wait for condition, error code=%d",
-                         GetLastError());
         return -1;
     }
 #else
@@ -1240,8 +1293,6 @@ int tMPI_Thread_cond_wait(tMPI_Thread_cond_t *cond, tMPI_Thread_mutex_t *mtx)
         /* do the actual waiting */
         if (WaitForSingleObject( cond->condp->ev, INFINITE ) == WAIT_FAILED)
         {
-            tMPI_Fatal_error(TMPI_FARGS, "Failed event reset, error code=%d",
-                             GetLastError());
             return -1;
         }
 
@@ -1268,8 +1319,6 @@ int tMPI_Thread_cond_wait(tMPI_Thread_cond_t *cond, tMPI_Thread_mutex_t *mtx)
     {
         if (!ResetEvent( cond->condp->ev ))
         {
-            tMPI_Fatal_error(TMPI_FARGS, "Failed event reset, error code=%d",
-                             GetLastError());
             return -1;
         }
     }
@@ -1283,10 +1332,15 @@ int tMPI_Thread_cond_wait(tMPI_Thread_cond_t *cond, tMPI_Thread_mutex_t *mtx)
 
 int tMPI_Thread_cond_signal(tMPI_Thread_cond_t *cond)
 {
+    int ret;
     /* check whether the condition is initialized */
     if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
     {
-        tMPI_Thread_cond_init_once(cond);
+        ret = tMPI_Thread_cond_init_once(cond);
+        if (ret != 0)
+        {
+            return ret;
+        }
     }
     /* The condition variable is now guaranteed to be valid. */
 #if 0
@@ -1302,8 +1356,6 @@ int tMPI_Thread_cond_signal(tMPI_Thread_cond_t *cond)
         if (!SetEvent(cond->condp->ev)) /* actually release the
                                            waiting threads */
         {
-            tMPI_Fatal_error(TMPI_FARGS, "Failed SetEvent, error code=%d",
-                             GetLastError());
             return -1;
         }
     }
@@ -1317,10 +1369,16 @@ int tMPI_Thread_cond_signal(tMPI_Thread_cond_t *cond)
 
 int tMPI_Thread_cond_broadcast(tMPI_Thread_cond_t *cond)
 {
+    int ret;
     /* check whether the condition is initialized */
     if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
     {
-        tMPI_Thread_cond_init_once(cond);
+        ret = tMPI_Thread_cond_init_once(cond);
+        if (ret != 0)
+        {
+            return ret;
+        }
+
     }
     /* The condition variable is now guaranteed to be valid. */
 #if 0
@@ -1336,8 +1394,6 @@ int tMPI_Thread_cond_broadcast(tMPI_Thread_cond_t *cond)
         if (!SetEvent(cond->condp->ev)) /* actually release the
                                            waiting threads */
         {
-            tMPI_Fatal_error(TMPI_FARGS, "Failed SetEvent, error code=%d",
-                             GetLastError());
             return -1;
         }
     }
@@ -1351,21 +1407,35 @@ int tMPI_Thread_cond_broadcast(tMPI_Thread_cond_t *cond)
 
 int tMPI_Thread_barrier_init(tMPI_Thread_barrier_t *barrier, int n)
 {
+    int ret;
+
     if (barrier == NULL)
     {
         return EINVAL;
     }
 
     barrier->barrierp = (struct tMPI_Thread_barrier*)
-        tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
+        malloc(sizeof(struct tMPI_Thread_barrier)*1);
+    if (barrier->barrierp == NULL)
+    {
+        return ENOMEM;
+    }
 
 #if 0
     /* use this once Vista is the oldest supported windows version: */
     InitializeCriticalSection(&(barrier->barrierp->cs));
     InitializeConditionVariable(&(barrier->barrierp->cv));
 #else
-    tMPI_Thread_mutex_init(&(barrier->barrierp->cs));
-    tMPI_Thread_cond_init(&(barrier->barrierp->cv));
+    ret = tMPI_Thread_mutex_init(&(barrier->barrierp->cs));
+    if (ret != 0)
+    {
+        return ret;
+    }
+    ret = tMPI_Thread_cond_init(&(barrier->barrierp->cv));
+    if (ret != 0)
+    {
+        return ret;
+    }
 #endif
 
     barrier->threshold = n;
@@ -1379,6 +1449,8 @@ int tMPI_Thread_barrier_init(tMPI_Thread_barrier_t *barrier, int n)
 
 int tMPI_Thread_barrier_destroy(tMPI_Thread_barrier_t *barrier)
 {
+    int ret;
+
     if (barrier == NULL)
     {
         return EINVAL;
@@ -1387,12 +1459,20 @@ int tMPI_Thread_barrier_destroy(tMPI_Thread_barrier_t *barrier)
 #if 0
     DeleteCriticalSection(&(barrier->barrierp->cs));
 #else
-    tMPI_Thread_mutex_destroy(&(barrier->barrierp->cs));
+    ret = tMPI_Thread_mutex_destroy(&(barrier->barrierp->cs));
+    if (ret != 0)
+    {
+        return ret;
+    }
 #endif
 
-    tMPI_Thread_cond_destroy(&(barrier->barrierp->cv));
+    ret = tMPI_Thread_cond_destroy(&(barrier->barrierp->cv));
+    if (ret != 0)
+    {
+        return ret;
+    }
 
-    tMPI_Free(barrier->barrierp);
+    free(barrier->barrierp);
 
     return 0;
 }
@@ -1424,7 +1504,11 @@ static int tMPI_Thread_barrier_init_once(tMPI_Thread_barrier_t *barrier, int n)
 
 
     /* initialize the initializers */
-    tMPI_Init_initers();
+    ret = tMPI_Init_initers();
+    if (ret != 0)
+    {
+        return ret;
+    }
 
     /* Lock the common one-time init mutex so we can check carefully */
     EnterCriticalSection( &barrier_init );
@@ -1445,21 +1529,28 @@ static int tMPI_Thread_barrier_init_once(tMPI_Thread_barrier_t *barrier, int n)
 
 int tMPI_Thread_barrier_wait(tMPI_Thread_barrier_t *barrier)
 {
-    int     cycle;
-    BOOL    rc  = FALSE;
-    int     ret = 0;
+    int  cycle;
+    BOOL rc  = FALSE;
+    int  ret = 0;
     /*tMPI_Thread_pthread_barrier_t *p;*/
 
     /* check whether the barrier is initialized */
     if (tMPI_Atomic_get( &(barrier->initialized)  ) == 0)
     {
-        tMPI_Thread_barrier_init_once(barrier, barrier->threshold);
+        ret = tMPI_Thread_barrier_init_once(barrier, barrier->threshold);
+        if (ret != 0)
+        {
+            return ret;
+        }
     }
-
 #if 0
     EnterCriticalSection( &(barrier->barrierp->cs)  );
 #else
-    tMPI_Thread_mutex_lock( &(barrier->barrierp->cs) );
+    ret = tMPI_Thread_mutex_lock( &(barrier->barrierp->cs) );
+    if (ret != 0)
+    {
+        return ret;
+    }
 #endif
 
 
@@ -1476,7 +1567,11 @@ int tMPI_Thread_barrier_wait(tMPI_Thread_barrier_t *barrier)
 #if 0
         WakeAllConditionVariable( &(barrier->barrierp->cv) );
 #else
-        tMPI_Thread_cond_broadcast( &(barrier->barrierp->cv) );
+        ret = tMPI_Thread_cond_broadcast( &(barrier->barrierp->cv) );
+        if (ret != 0)
+        {
+            return ret;
+        }
 #endif
     }
     else
index 7d283f7df732a2a19f153528d089b45c1b0df15d..9cf04d86def7e006086eeacc5627dcce29f3a3b7 100644 (file)
@@ -173,14 +173,13 @@ static const t_ftupd ftupd[] = {
     { 46, F_ECONSERVED        },
     { 69, F_VTEMP_NOLONGERUSED},
     { 66, F_PDISPCORR         },
-    { 54, F_DHDL_CON          },
+    { 54, F_DVDL_CONSTR       },
     { 76, F_ANHARM_POL        },
     { 79, F_DVDL_COUL         },
     { 79, F_DVDL_VDW,         },
     { 79, F_DVDL_BONDED,      },
     { 79, F_DVDL_RESTRAINT    },
     { 79, F_DVDL_TEMPERATURE  },
-    { 54, F_DHDL_CON          }
 };
 #define NFTUPD asize(ftupd)
 
index 323309366d8fa49f5ff71197bd9e896804d59313..641e7dc5a160fc8eff8bb481936afe591ab1719a 100644 (file)
@@ -1076,6 +1076,17 @@ void check_ir(const char *mdparin, t_inputrec *ir, t_gromppopts *opts,
         warning_note(wi, warn_buf);
     }
 
+    if (ir->coulombtype == eelPMESWITCH)
+    {
+        if (ir->rcoulomb_switch/ir->rcoulomb < 0.9499)
+        {
+            sprintf(warn_buf, "The switching range for %s should be 5%% or less, energy conservation will be good anyhow, since ewald_rtol = %g",
+                    eel_names[ir->coulombtype],
+                    ir->ewald_rtol);
+            warning(wi, warn_buf);
+        }
+    }
+
     if (EEL_FULL(ir->coulombtype))
     {
         if (ir->coulombtype == eelPMESWITCH || ir->coulombtype == eelPMEUSER ||
@@ -1142,6 +1153,16 @@ void check_ir(const char *mdparin, t_inputrec *ir, t_gromppopts *opts,
     }
     if (ir->cutoff_scheme == ecutsGROUP)
     {
+        if (((ir->coulomb_modifier != eintmodNONE && ir->rcoulomb == ir->rlist) ||
+             (ir->vdw_modifier != eintmodNONE && ir->rvdw == ir->rlist)) &&
+            ir->nstlist != 1)
+        {
+            warning_note(wi, "With exact cut-offs, rlist should be "
+                         "larger than rcoulomb and rvdw, so that there "
+                         "is a buffer region for particle motion "
+                         "between neighborsearch steps");
+        }
+
         if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype)
             && (ir->rlistlong <= ir->rcoulomb))
         {
@@ -3821,7 +3842,10 @@ void check_chargegroup_radii(const gmx_mtop_t *mtop, const t_inputrec *ir,
         if (rvdw1  + rvdw2  > ir->rlist ||
             rcoul1 + rcoul2 > ir->rlist)
         {
-            sprintf(warn_buf, "The sum of the two largest charge group radii (%f) is larger than rlist (%f)\n", max(rvdw1+rvdw2, rcoul1+rcoul2), ir->rlist);
+            sprintf(warn_buf,
+                    "The sum of the two largest charge group radii (%f) "
+                    "is larger than rlist (%f)\n",
+                    max(rvdw1+rvdw2, rcoul1+rcoul2), ir->rlist);
             warning(wi, warn_buf);
         }
         else
@@ -3830,13 +3854,19 @@ void check_chargegroup_radii(const gmx_mtop_t *mtop, const t_inputrec *ir,
              * since user defined interactions might purposely
              * not be zero at the cut-off.
              */
-            if (EVDW_IS_ZERO_AT_CUTOFF(ir->vdwtype) &&
+            if ((EVDW_IS_ZERO_AT_CUTOFF(ir->vdwtype) ||
+                 ir->vdw_modifier != eintmodNONE) &&
                 rvdw1 + rvdw2 > ir->rlistlong - ir->rvdw)
             {
-                sprintf(warn_buf, "The sum of the two largest charge group radii (%f) is larger than %s (%f) - rvdw (%f)\n",
+                sprintf(warn_buf, "The sum of the two largest charge group "
+                        "radii (%f) is larger than %s (%f) - rvdw (%f).\n"
+                        "With exact cut-offs, better performance can be "
+                        "obtained with cutoff-scheme = %s, because it "
+                        "does not use charge groups at all.",
                         rvdw1+rvdw2,
                         ir->rlistlong > ir->rlist ? "rlistlong" : "rlist",
-                        ir->rlistlong, ir->rvdw);
+                        ir->rlistlong, ir->rvdw,
+                        ecutscheme_names[ecutsVERLET]);
                 if (ir_NVE(ir))
                 {
                     warning(wi, warn_buf);
@@ -3846,13 +3876,16 @@ void check_chargegroup_radii(const gmx_mtop_t *mtop, const t_inputrec *ir,
                     warning_note(wi, warn_buf);
                 }
             }
-            if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype) &&
+            if ((EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype) ||
+                 ir->coulomb_modifier != eintmodNONE) &&
                 rcoul1 + rcoul2 > ir->rlistlong - ir->rcoulomb)
             {
-                sprintf(warn_buf, "The sum of the two largest charge group radii (%f) is larger than %s (%f) - rcoulomb (%f)\n",
+                sprintf(warn_buf, "The sum of the two largest charge group radii (%f) is larger than %s (%f) - rcoulomb (%f).\n"
+                        "With exact cut-offs, better performance can be obtained with cutoff-scheme = %s, because it does not use charge groups at all.",
                         rcoul1+rcoul2,
                         ir->rlistlong > ir->rlist ? "rlistlong" : "rlist",
-                        ir->rlistlong, ir->rcoulomb);
+                        ir->rlistlong, ir->rcoulomb,
+                        ecutscheme_names[ecutsVERLET]);
                 if (ir_NVE(ir))
                 {
                     warning(wi, warn_buf);
index e317d9ee6e1f04bd1721179f6b62aee9ee758592..fdbf854dc4a697cab9a7be52d59bbfdd73abbfc2 100644 (file)
@@ -39,6 +39,8 @@ enum gmx_cpuid_vendor
     GMX_CPUID_VENDOR_UNKNOWN,
     GMX_CPUID_VENDOR_INTEL,
     GMX_CPUID_VENDOR_AMD,
+    GMX_CPUID_VENDOR_FUJITSU,
+    GMX_CPUID_VENDOR_IBM,
     GMX_CPUID_NVENDORS
 };
 
@@ -112,6 +114,7 @@ enum gmx_cpuid_acceleration
     GMX_CPUID_ACCELERATION_X86_SSE4_1,
     GMX_CPUID_ACCELERATION_X86_AVX_128_FMA,
     GMX_CPUID_ACCELERATION_X86_AVX_256,
+    GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE,
     GMX_CPUID_NACCELERATIONS
 };
 
index 6ac5fff78c6ff146950789e25402b8c686541db8..b6d8d9a333c21c7a909285f95697866b12a2599d 100644 (file)
@@ -86,25 +86,22 @@ extern "C"
 } /* Avoids screwing up auto-indentation */
 #endif
 
+/* first check for gcc/icc platforms.
+   Some compatible compilers, like icc on linux+mac will take this path,
+   too */
+#if ( (defined(__GNUC__) || defined(__PATHSCALE__) || defined(__PGI)) && \
+    (!defined(__xlc__)) && (!defined(TMPI_TEST_NO_ATOMICS)) )
+
 #ifdef __GNUC__
 #define TMPI_GCC_VERSION (__GNUC__ * 10000 \
                           + __GNUC_MINOR__ * 100 \
                           + __GNUC_PATCHLEVEL__)
 #endif
 
-
-/* first check for gcc/icc platforms.
-   Some compatible compilers, like icc on linux+mac will take this path,
-   too */
-#if ( (defined(__GNUC__) || defined(__PATHSCALE__) || defined(__PGI)) && (!defined(__xlc__)) )
-
-
-
 /* now check specifically for several architectures: */
 #if ((defined(i386) || defined(__x86_64__)) && !defined(__OPEN64__))
 /* first x86: */
 #include "atomic/gcc_x86.h"
-/*#include "atomic/gcc.h"*/
 
 #elif (defined(__ia64__))
 /* then ia64: */
@@ -114,6 +111,11 @@ extern "C"
 /*#elif (defined(__powerpc__) || (defined(__ppc__)) )*/
 /*#include "atomic/gcc_ppc.h"*/
 
+#elif defined(__FUJITSU) && ( defined(__sparc_v9__) || defined (__sparcv9) )
+
+/* Fujitsu FX10 SPARC compiler */
+#include "atomic/fujitsu_sparc.h"
+
 #else
 /* otherwise, there's a generic gcc intrinsics version: */
 #include "atomic/gcc.h"
@@ -121,28 +123,36 @@ extern "C"
 #endif /* end of check for gcc specific architectures */
 
 /* not gcc: */
-#elif (defined(_MSC_VER) && (_MSC_VER >= 1200))
+#elif (defined(_MSC_VER) && (_MSC_VER >= 1200) && \
+    (!defined(TMPI_TEST_NO_ATOMICS)) )
+
 /* Microsoft Visual C on x86, define taken from FFTW who got it from
    Morten Nissov. icc on windows will take this path.  */
 #include "atomic/msvc.h"
 
 #elif ( (defined(__IBM_GCC_ASM) || defined(__IBM_STDCPP_ASM))  && \
-    (defined(__powerpc__) || defined(__ppc__)))
+    (defined(__powerpc__) || defined(__ppc__)) && \
+    (!defined(TMPI_TEST_NO_ATOMICS)) )
 
 /* PowerPC using xlC intrinsics.  */
 
 #include "atomic/xlc_ppc.h"
 
-#elif defined(__xlC__)  || defined(__xlc__)
+#elif ( ( defined(__xlC__)  || defined(__xlc__) ) && \
+    (!defined(TMPI_TEST_NO_ATOMICS)) )
 /* IBM xlC compiler */
 #include "atomic/xlc_ppc.h"
 
 
-#elif defined (__sun) && (defined(__sparcv9) || defined(__sparc))
+#elif (defined (__sun) && (defined(__sparcv9) || defined(__sparc)) && \
+    (!defined(TMPI_TEST_NO_ATOMICS)) )
 /* Solaris on SPARC (Sun C Compiler, Solaris Studio) */
 #include "atomic/suncc-sparc.h"
 
+#elif defined(__FUJITSU) && defined(__sparc__)
 
+/* Fujitsu FX10 SPARC compiler requires gcc compatibility with -Xg */
+#error Atomics support for Fujitsu FX10 compiler requires -Xg (gcc compatibility)
 
 
 #else
@@ -152,9 +162,10 @@ extern "C"
 #error No atomic operations implemented for this cpu/compiler combination.
 #endif
 
+#ifndef DOXYGEN
 /** Indicates that no support for atomic operations is present. */
 #define TMPI_NO_ATOMICS
-
+#endif
 
 /** Memory barrier operation
 
@@ -195,11 +206,10 @@ extern "C"
  */
 #define tMPI_Atomic_memory_barrier_rel()
 
-
-
-
-/** System mutex used for locking to guarantee atomicity */
-static tMPI_Thread_mutex_t tMPI_Atomic_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+#ifndef DOXYGEN
+/* signal that they exist */
+#define TMPI_HAVE_ACQ_REL_BARRIERS
+#endif
 
 /** Atomic operations datatype
  *
@@ -248,6 +258,7 @@ static tMPI_Thread_mutex_t tMPI_Atomic_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
  *  - PowerPC, using GNU compilers
  *  - PowerPC, using IBM AIX compilers
  *  - PowerPC, using IBM compilers >=7.0 under Linux or Mac OS X.
+ *  - Sparc64, using Fujitsu compilers.
  *
  * \see
  * - tMPI_Atomic_get
@@ -258,7 +269,7 @@ static tMPI_Thread_mutex_t tMPI_Atomic_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
  */
 typedef struct tMPI_Atomic
 {
-    int value;  /**< The atomic value. */
+    int value; /**< The atomic value.*/
 }
 tMPI_Atomic_t;
 
@@ -274,7 +285,7 @@ tMPI_Atomic_t;
  */
 typedef struct tMPI_Atomic_ptr
 {
-    void* value;  /**< The atomic pointer value. */
+    void *value; /**< The atomic pointer. */
 }
 tMPI_Atomic_ptr_t;
 
@@ -298,13 +309,7 @@ tMPI_Atomic_ptr_t;
  * - tMPI_Spinlock_trylock
  * - tMPI_Spinlock_wait
  */
-typedef struct
-{
-#ifndef DOXYGEN
-    tMPI_Thread_mutex_t lock; /* we don't want this documented */
-#endif
-} tMPI_Spinlock_t;
-/*#define tMPI_Spinlock_t     tMPI_Thread_mutex_t*/
+typedef struct tMPI_Spinlock *tMPI_Spinlock_t;
 
 /*! \def TMPI_SPINLOCK_INITIALIZER
  * \brief Spinlock static initializer
@@ -318,7 +323,7 @@ typedef struct
  *
  *  \hideinitializer
  */
-#  define TMPI_SPINLOCK_INITIALIZER   { TMPI_THREAD_MUTEX_INITIALIZER }
+#define TMPI_SPINLOCK_INITIALIZER   { NULL }
 
 /* Since mutexes guarantee memory barriers this works fine */
 /** Return value of an atomic integer
@@ -331,11 +336,8 @@ typedef struct
  *
  *  \hideinitializer
  */
-#ifdef DOXYGEN
-static inline int tMPI_Atomic_get(tMPI_Atomic_t &a);
-#else
-#define tMPI_Atomic_get(a)   ((a)->value)
-#endif
+TMPI_EXPORT
+int tMPI_Atomic_get(const tMPI_Atomic_t *a);
 
 /** Write value to an atomic integer
  *
@@ -348,13 +350,7 @@ static inline int tMPI_Atomic_get(tMPI_Atomic_t &a);
  *  \hideinitializer
  */
 TMPI_EXPORT
-static inline void tMPI_Atomic_set(tMPI_Atomic_t *a, int i)
-{
-    /* Mutexes here are necessary to guarantee memory visibility */
-    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-    a->value = i;
-    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
-}
+void tMPI_Atomic_set(tMPI_Atomic_t *a, int i);
 
 
 /** Return value of an atomic pointer
@@ -367,11 +363,8 @@ static inline void tMPI_Atomic_set(tMPI_Atomic_t *a, int i)
  *
  *  \hideinitializer
  */
-#ifdef DOXYGEN
-static inline void* tMPI_Atomic_ptr_get(tMPI_Atomic_ptr_t &a);
-#else
-#define tMPI_Atomic_ptr_get(a)   ((a)->value)
-#endif
+TMPI_EXPORT
+void* tMPI_Atomic_ptr_get(const tMPI_Atomic_ptr_t *a);
 
 
 
@@ -387,14 +380,7 @@ static inline void* tMPI_Atomic_ptr_get(tMPI_Atomic_ptr_t &a);
  *  \hideinitializer
  */
 TMPI_EXPORT
-static inline void tMPI_Atomic_ptr_set(tMPI_Atomic_t *a, void *p)
-{
-    /* Mutexes here are necessary to guarantee memory visibility */
-    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-    a->value = (void*)p;
-    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
-}
-
+void tMPI_Atomic_ptr_set(tMPI_Atomic_ptr_t *a, void *p);
 
 /** Add integer to atomic variable
  *
@@ -407,15 +393,10 @@ static inline void tMPI_Atomic_ptr_set(tMPI_Atomic_t *a, void *p)
  *  \return The new value (after summation).
  */
 TMPI_EXPORT
-static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
-{
-    int t;
-    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-    t        = a->value + i;
-    a->value = t;
-    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
-    return t;
-}
+int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i);
+#ifndef DOXYGEN
+#define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
+#endif
 
 
 
@@ -435,16 +416,10 @@ static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
  *  \return    The value of the atomic variable before addition.
  */
 TMPI_EXPORT
-static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
-{
-    int old_value;
-
-    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-    old_value  = a->value;
-    a->value   = old_value + i;
-    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
-    return old_value;
-}
+int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i);
+#ifndef DOXYGEN
+#define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
+#endif
 
 
 
@@ -477,19 +452,7 @@ static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
  *   \note   The exchange occured if the return value is identical to \a old.
  */
 TMPI_EXPORT
-static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int old_val, int new_val)
-{
-    int t = 0;
-
-    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-    if (a->value == old_val)
-    {
-        a->value = new_val;
-        t        = 1;
-    }
-    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
-    return t;
-}
+int tMPI_Atomic_cas(tMPI_Atomic_t *a, int old_val, int new_val);
 
 
 
@@ -515,20 +478,38 @@ static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int old_val, int new_val)
  *   \note   The exchange occured if the return value is identical to \a old.
  */
 TMPI_EXPORT
-static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t * a, void *old_val,
-                                      void *new_val)
-{
-    int t = 0;
-
-    tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-    if (a->value == old_val)
-    {
-        a->value = new_val;
-        t        = 1;
-    }
-    tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
-    return t;
-}
+int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t * a, void *old_val,
+                        void *new_val);
+
+/** Atomic swap operation.
+
+   Atomically swaps the data in the tMPI_Atomic_t operand with the value of b.
+   Note: This has no good assembly counterparts on many architectures, so
+         it might not be faster than a repeated CAS.
+
+   \param a  Pointer to atomic type
+   \param b  Value to swap
+   \return the original value of a
+ */
+TMPI_EXPORT
+int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b);
+
+/** Atomic swap pointer operation.
+
+   Atomically swaps the pointer in the tMPI_Atomic_ptr_t operand with the
+   value of b.
+   Note: This has no good assembly counterparts on many architectures, so
+         it might not be faster than a repeated CAS.
+
+   \param a  Pointer to atomic type
+   \param b  Value to swap
+   \return the original value of a
+ */
+TMPI_EXPORT
+void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b);
+#ifndef DOXYGEN
+#define TMPI_ATOMIC_HAVE_NATIVE_SWAP
+#endif
 
 
 /** Initialize spinlock
@@ -542,10 +523,10 @@ static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t * a, void *old_val,
  *
  *  \hideinitializer
  */
-#ifdef DOXYGEN
-void tMPI_Spinlock_init( tMPI_Spinlock_t &x);
-#else
-#define tMPI_Spinlock_init(x)       tMPI_Thread_mutex_init((x)->lock)
+TMPI_EXPORT
+void tMPI_Spinlock_init( tMPI_Spinlock_t *x);
+#ifndef DOXYGEN
+#define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
 #endif
 
 /** Acquire spinlock
@@ -555,11 +536,8 @@ void tMPI_Spinlock_init( tMPI_Spinlock_t &x);
  *
  *  \param x     Spinlock pointer
  */
-#ifdef DOXYGEN
-void tMPI_Spinlock_lock( tMPI_Spinlock_t &x);
-#else
-#define tMPI_Spinlock_lock(x)       tMPI_Thread_mutex_lock((x)->lock)
-#endif
+TMPI_EXPORT
+void tMPI_Spinlock_lock( tMPI_Spinlock_t *x);
 
 
 /** Attempt to acquire spinlock
@@ -572,11 +550,8 @@ void tMPI_Spinlock_lock( tMPI_Spinlock_t &x);
  * \return 0 if the mutex was available so we could lock it,
  *         otherwise a non-zero integer (1) if the lock is busy.
  */
-#ifdef DOXYGEN
-void tMPI_Spinlock_trylock( tMPI_Spinlock_t &x);
-#else
-#define tMPI_Spinlock_trylock(x)    tMPI_Thread_mutex_trylock((x)->lock)
-#endif
+TMPI_EXPORT
+int tMPI_Spinlock_trylock( tMPI_Spinlock_t *x);
 
 /** Release spinlock
  *
@@ -584,11 +559,8 @@ void tMPI_Spinlock_trylock( tMPI_Spinlock_t &x);
  *
  *  Unlocks the spinlock, regardless if which thread locked it.
  */
-#ifdef DOXYGEN
-void tMPI_Spinlock_unlock( tMPI_Spinlock_t &x);
-#else
-#define tMPI_Spinlock_unlock(x)     tMPI_Thread_mutex_unlock((x)->lock)
-#endif
+TMPI_EXPORT
+void tMPI_Spinlock_unlock( tMPI_Spinlock_t *x);
 
 
 
@@ -601,20 +573,7 @@ void tMPI_Spinlock_unlock( tMPI_Spinlock_t &x);
  *  \return 1 if the spinlock is locked, 0 otherwise.
  */
 TMPI_EXPORT
-static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
-{
-    if (tMPI_Spinlock_trylock(x) != 0)
-    {
-        /* It was locked */
-        return 1;
-    }
-    else
-    {
-        /* We just locked it */
-        tMPI_Spinlock_unlock(x);
-        return 0;
-    }
-}
+int tMPI_Spinlock_islocked( tMPI_Spinlock_t *x);
 
 /** Wait for a spinlock to become available
  *
@@ -625,71 +584,14 @@ static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
  *  \param x  Spinlock pointer
  */
 TMPI_EXPORT
-static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
-{
-    tMPI_Spinlock_lock(x);
-    /* Got the lock now, so the waiting is over */
-    tMPI_Spinlock_unlock(x);
-}
-
-
-#endif
-
-
-
-/* only do this if there was no better solution */
-#ifndef TMPI_HAVE_SWAP
-/** Atomic swap operation.
-
-   Atomically swaps the data in the tMPI_Atomic_t operand with the value of b.
-   NOTE: DON'T USE YET! (This has no good asm counterparts on many architectures).
-
-   \param a  Pointer to atomic type
-   \param b  Value to swap
-   \return the original value of a
- */
-TMPI_EXPORT
-static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
-{
-    int oldval;
-    do
-    {
-        oldval = (int)(a->value);
-    }
-    while (!tMPI_Atomic_cas(a, oldval, b));
-    return oldval;
-}
-/** Atomic swap pointer operation.
-
-   Atomically swaps the pointer in the tMPI_Atomic_ptr_t operand with the
-   value of b.
-   NOTE: DON'T USE YET! (This has no good asm counterparts on many architectures).
-
-   \param a  Pointer to atomic type
-   \param b  Value to swap
-   \return the original value of a
- */
-TMPI_EXPORT
-static inline void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
-{
-    void *oldval;
-    do
-    {
-        oldval = (void*)(a->value);
-    }
-    while (!tMPI_Atomic_ptr_cas(a, oldval, b));
-    return oldval;
-}
-#endif
+void tMPI_Spinlock_wait(tMPI_Spinlock_t *x);
 
-/* only define this if there were no separate acquire and release barriers */
-#ifndef TMPI_HAVE_ACQ_REL_BARRIERS
 
-/* if they're not defined explicitly, we just make full barriers out of both */
-#define tMPI_Atomic_memory_barrier_acq tMPI_Atomic_memory_barrier
-#define tMPI_Atomic_memory_barrier_rel tMPI_Atomic_memory_barrier
+#endif /* platform-specific checks */
 
-#endif
+/* now define all the atomics that are not available natively. These
+   are done on the assumption that a native CAS does exist. */
+#include "atomic/derived.h"
 
 /* this allows us to use the inline keyword without breaking support for
    some compilers that don't support it: */
@@ -697,6 +599,13 @@ static inline void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
 #undef inline
 #endif
 
+#if !defined(TMPI_NO_ATOMICS) && !defined(TMPI_ATOMICS)
+/* Set it here to make sure the user code can check this without having to have
+   a config.h */
+/** Indicates that support for atomic operations is present. */
+#define TMPI_ATOMICS
+#endif
+
 
 #ifdef __cplusplus
 }
index 9d477d441ce84f968da705ea43ccb38b3c391bb2..a4c7f2787da24f60980f9f6471852e3d3f0a1325 100644 (file)
@@ -7,13 +7,13 @@
 #if ((defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__)  || defined(__PGIC__)) && (defined(__i386__) || defined(__x86_64__)))
 #define TMPI_CYCLE_COUNT
 /* x86 or x86-64 with GCC inline assembly */
-typedef unsigned long long tmpi_cycles_t;
+typedef unsigned long long tMPI_Cycles_t;
 
-static __inline__ tmpi_cycles_t tmpi_cycles_read(void)
+static __inline__ tMPI_Cycles_t tMPI_Cycles_read(void)
 {
     /* x86 with GCC inline assembly - pentium TSC register */
-    tmpi_cycles_t   cycle;
-    unsigned        low, high;
+    tMPI_Cycles_t cycle;
+    unsigned      low, high;
 
 #ifdef HAVE_RDTSCP
     __asm__ __volatile__("rdtscp" : "=a" (low), "=d" (high) :: "ecx" );
@@ -27,26 +27,26 @@ static __inline__ tmpi_cycles_t tmpi_cycles_read(void)
 }
 #elif (defined(__INTEL_COMPILER) && defined(__ia64__))
 #define TMPI_CYCLE_COUNT
-typedef unsigned long tmpi_cycles_t;
-static __inline__ tmpi_cycles_t tmpi_cycles_read(void)
+typedef unsigned long tMPI_Cycles_t;
+static __inline__ tMPI_Cycles_t tMPI_Cycles_read(void)
 {
     /* Intel compiler on ia64 */
     return __getReg(_IA64_REG_AR_ITC);
 }
 #elif defined(__GNUC__) && defined(__ia64__)
 #define TMPI_CYCLE_COUNT
-typedef unsigned long tmpi_cycles_t;
-static __inline__ tmpi_cycles_t tmpi_cycles_read(void)
+typedef unsigned long tMPI_Cycles_t;
+static __inline__ tMPI_Cycles_t tMPI_Cycles_read(void)
 {
     /* ia64 with GCC inline assembly */
-    tmpi_cycles_t ret;
+    tMPI_Cycles_t ret;
     __asm__ __volatile__ ("mov %0=ar.itc" : "=r" (ret));
     return ret;
 }
 #elif defined(_MSC_VER)
 #define TMPI_CYCLE_COUNT
-typedef __int64 tmpi_cycles_t;
-static __inline tmpi_cycles_t tmpi_cycles_read(void)
+typedef __int64 tMPI_Cycles_t;
+static __inline tMPI_Cycles_t tMPI_Cycles_read(void)
 {
 #ifdef HAVE_RDTSCP
     unsigned int ui;
diff --git a/src/gromacs/legacyheaders/thread_mpi/atomic/derived.h b/src/gromacs/legacyheaders/thread_mpi/atomic/derived.h
new file mode 100644 (file)
index 0000000..68b0fe5
--- /dev/null
@@ -0,0 +1,186 @@
+/*
+   This source code file is part of thread_mpi.
+   Written by Sander Pronk, Erik Lindahl, and possibly others.
+
+   Copyright (c) 2013, Sander Pronk, Erik Lindahl.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+   1) Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   2) Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   3) Neither the name of the copyright holders nor the
+   names of its contributors may be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
+   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
+   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   If you want to redistribute modifications, please consider that
+   scientific software is very special. Version control is crucial -
+   bugs must be traceable. We will be happy to consider code for
+   inclusion in the official distribution, but derived work should not
+   be called official thread_mpi. Details are found in the README & COPYING
+   files.
+ */
+
+
+/* These functions are fallback definitions for when there are no native
+   variants for fetch-add, spinlock, etc., but there is a native
+   compare-and-swap. */
+
+
+/* Fallback for platforms that did not define TMPI_HAVE_ACQ_REL_BARRIERS,
+   i.e. that provide no separate acquire and release barriers. */
+#ifndef TMPI_HAVE_ACQ_REL_BARRIERS
+
+/* if they're not defined explicitly, we just make full barriers out of both:
+   the _acq and _rel names become plain aliases for
+   tMPI_Atomic_memory_barrier. */
+#define tMPI_Atomic_memory_barrier_acq tMPI_Atomic_memory_barrier
+#define tMPI_Atomic_memory_barrier_rel tMPI_Atomic_memory_barrier
+
+#endif /* TMPI_HAVE_ACQ_REL_BARRIERS */
+
+#ifndef TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
+/* Fallback fetch-and-add built on compare-and-swap.
+
+   Adds i to the value stored in *a and returns the value that was stored
+   before the addition. The current value is read, the sum is computed, and
+   tMPI_Atomic_cas() is asked to install it; the whole step is repeated for
+   as long as the CAS reports failure (returns zero). */
+TMPI_EXPORT
+static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
+{
+    int newval, oldval;
+    do
+    {
+        /* barrier is re-issued before every (re-)read of the value */
+        tMPI_Atomic_memory_barrier_acq();
+        oldval = tMPI_Atomic_get(a);
+        newval = oldval + i;
+    }
+    while (!tMPI_Atomic_cas(a, oldval, newval));
+    tMPI_Atomic_memory_barrier_rel();
+    return oldval;
+}
+#endif /* TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD */
+
+
+#ifndef TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
+/* Fallback add-and-return: adds i to the value stored in *a and returns the
+   value after the addition. */
+TMPI_EXPORT
+static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
+{
+    /* implement in terms of fetch-add: fetch-add yields the value before
+       the addition, so adding i once more gives the new value */
+    return tMPI_Atomic_fetch_add(a, i) + i;
+}
+#endif /* TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN */
+
+
+
+
+
+/* only do this if there was no better solution: the swap functions below
+   are compiled only when the platform header did not define
+   TMPI_ATOMIC_HAVE_NATIVE_SWAP. */
+#ifndef TMPI_ATOMIC_HAVE_NATIVE_SWAP
+/* Fallback integer swap built on compare-and-swap.
+
+   Stores b into *a and returns the value that was read from *a by the
+   iteration whose CAS succeeded. The read is repeated for as long as
+   tMPI_Atomic_cas() reports failure (returns zero). */
+TMPI_EXPORT
+static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
+{
+    int oldval;
+    do
+    {
+        /* reads the value member directly rather than via tMPI_Atomic_get */
+        oldval = (int)(a->value);
+    }
+    while (!tMPI_Atomic_cas(a, oldval, b));
+    return oldval;
+}
+
+
+/* Fallback pointer swap built on compare-and-swap.
+
+   Stores b into *a and returns the pointer that was read from *a by the
+   iteration whose CAS succeeded. The read is repeated for as long as
+   tMPI_Atomic_ptr_cas() reports failure (returns zero). */
+TMPI_EXPORT
+static inline void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
+{
+    void *oldval;
+    do
+    {
+        oldval = (void*)(a->value);
+    }
+    while (!tMPI_Atomic_ptr_cas(a, oldval, b));
+    return oldval;
+}
+#endif /* TMPI_ATOMIC_HAVE_NATIVE_SWAP */
+
+
+
+/* Fallback spinlock, compiled only when the platform header did not define
+   TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK. */
+#ifndef TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
+
+/* Spinlock represented by a single atomic integer. The functions below set
+   it to 0 when the lock is free and swap it from 0 to 1 to take the lock. */
+typedef struct tMPI_Spinlock
+{
+    tMPI_Atomic_t a; /* lock word: 0 = unlocked, 1 = locked */
+}
+tMPI_Spinlock_t;
+
+/* static initializer: starts with the lock word at 0 (unlocked) */
+#define TMPI_SPINLOCK_INITIALIZER   { 0 }
+
+
+
+/* Initialize the spinlock *x to the unlocked state by setting its lock
+   word to 0. No barrier is issued here. */
+TMPI_EXPORT
+static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
+{
+    tMPI_Atomic_set(&(x->a), 0);
+}
+
+
+/* Acquire the spinlock *x, spinning until it is obtained.
+
+   The inner loop only reads the lock word and waits while it is 1; the
+   compare-and-swap from 0 to 1 is attempted once the word is no longer 1,
+   and the whole sequence is repeated if that CAS fails. */
+TMPI_EXPORT
+static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
+{
+    tMPI_Atomic_memory_barrier_acq();
+    do
+    {
+        /* read-only wait: no CAS is attempted while the word reads 1 */
+        while (tMPI_Atomic_get(&(x->a)) == 1)
+        {
+            tMPI_Atomic_memory_barrier_acq();
+        }
+    }
+    while (!tMPI_Atomic_cas(&(x->a), 0, 1));
+    /* barrier after the successful CAS, before returning to the caller */
+    tMPI_Atomic_memory_barrier_acq();
+}
+
+
+/* Try to acquire the spinlock *x without spinning.
+
+   Makes a single compare-and-swap attempt from 0 to 1 on the lock word.
+   Returns 0 if the lock was acquired, and non-zero if the CAS failed
+   (the lock was not obtained). */
+TMPI_EXPORT
+static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
+{
+    int ret;
+    tMPI_Atomic_memory_barrier_acq();
+    ret = !tMPI_Atomic_cas(&(x->a), 0, 1);
+    /* same post-acquisition barrier as tMPI_Spinlock_lock(), so that the
+       caller's critical section is ordered after a successful CAS */
+    tMPI_Atomic_memory_barrier_acq();
+    return ret;
+}
+
+
+/* Release the spinlock *x by setting its lock word back to 0. A release
+   barrier is issued both before and after the store. */
+TMPI_EXPORT
+static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
+{
+    tMPI_Atomic_memory_barrier_rel();
+    tMPI_Atomic_set(&(x->a), 0);
+    tMPI_Atomic_memory_barrier_rel();
+}
+
+
+/* Report whether the spinlock *x is currently held.
+
+   Returns non-zero if the lock word is not 0 at the time of the read, and
+   0 otherwise. The lock itself is not modified. */
+TMPI_EXPORT
+static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
+{
+    int ret;
+    /* NOTE(review): a release barrier is used ahead of a pure load here;
+       confirm whether an acquire barrier was intended. */
+    tMPI_Atomic_memory_barrier_rel();
+    ret = (tMPI_Atomic_get(&(x->a)) != 0);
+    return ret;
+}
+
+
+/* Spin until the spinlock *x is observed to be unlocked.
+
+   This only polls tMPI_Spinlock_islocked(); it does not take the lock. */
+TMPI_EXPORT
+static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
+{
+    do
+    {
+        /* empty body: all the work is in the loop condition */
+    }
+    while (tMPI_Spinlock_islocked(x));
+}
+#endif /* TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK */
diff --git a/src/gromacs/legacyheaders/thread_mpi/atomic/fujitsu_sparc64.h b/src/gromacs/legacyheaders/thread_mpi/atomic/fujitsu_sparc64.h
new file mode 100644 (file)
index 0000000..0566338
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+   This source code file is part of thread_mpi.
+   Written by Sander Pronk, Erik Lindahl, and possibly others.
+
+   Copyright (c) 2013, Sander Pronk, Erik Lindahl.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+   1) Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   2) Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   3) Neither the name of the copyright holders nor the
+   names of its contributors may be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
+   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
+   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   If you want to redistribute modifications, please consider that
+   scientific software is very special. Version control is crucial -
+   bugs must be traceable. We will be happy to consider code for
+   inclusion in the official distribution, but derived work should not
+   be called official thread_mpi. Details are found in the README & COPYING
+   files.
+ */
+
+/* Memory barriers, each a single "membar" instruction with a different set
+   of ordering constraints. Each macro expands to a brace-enclosed block.
+
+   NOTE(review): none of the asm statements lists a "memory" clobber, so
+   whether the compiler itself keeps memory accesses from moving across
+   them depends on the compiler - confirm for the Fujitsu toolchain. */
+
+/* full barrier: all four membar ordering constraints are requested */
+#define tMPI_Atomic_memory_barrier() { asm ("membar   #StoreStore | #LoadStore | #LoadLoad | #StoreLoad "); }
+/* acquire barrier.
+   NOTE(review): requests #StoreStore | #StoreLoad, i.e. only store-first
+   constraints - confirm against the Sparc v9 spec that this is the
+   intended set for acquire semantics. */
+#define tMPI_Atomic_memory_barrier_acq() { asm ("membar   #StoreStore | #StoreLoad ");  }
+/* release barrier: requests #LoadStore | #StoreStore */
+#define tMPI_Atomic_memory_barrier_rel() { asm ("membar   #LoadStore | #StoreStore ");  }
+/* tells derived.h not to alias the _acq/_rel barriers to the full barrier */
+#define TMPI_HAVE_ACQ_REL_BARRIERS
+
+
+/* Atomic integer type: a volatile int declared with 64-byte alignment. */
+typedef struct tMPI_Atomic
+{
+    volatile int value __attribute__ ((aligned(64)));
+}
+tMPI_Atomic_t;
+
+
+/* Atomic pointer type, declared with 64-byte alignment.
+
+   NOTE(review): the member is declared as "volatile char* volatile*"
+   (pointer to pointer to char), while the CAS and swap code treats it as a
+   plain void* slot - confirm this declaration is what was intended. */
+typedef struct tMPI_Atomic_ptr
+{
+    volatile char* volatile* value __attribute__ ((aligned(64)));  /*!< Volatile, to avoid compiler aliasing */
+}
+tMPI_Atomic_ptr_t;
+
+
+/* On sparc64, aligned 32-bit and 64-bit memory accesses are atomic, so get
+   and set are plain reads and writes of the value member. */
+#define tMPI_Atomic_get(a)   (int)((a)->value)
+#define tMPI_Atomic_set(a, i)  (((a)->value) = (i))
+#define tMPI_Atomic_ptr_get(a)   ((a)->value)
+#define tMPI_Atomic_ptr_set(a, i)  (((a)->value) = (i))
+
+/* static initializer for a spinlock: lock word starts at 0 */
+#define TMPI_SPINLOCK_INITIALIZER   { 0 }
+
+/* we just define the CAS operation. Fetch-and-add and spinlocks are
+   implemented through derived.h; this follows the recommendations of the
+   Sparc v9 programming specs. */
+
+/* Integer compare-and-swap using the "cas" instruction.
+
+   Operands: %0 is newval (used both as input and as output), %1 is oldval,
+   %2 is the address of a->value. Returns non-zero when the value the
+   instruction leaves in newval equals oldval, zero otherwise.
+
+   Presumably (per the Sparc v9 definition of cas - TODO confirm) the
+   instruction leaves the previous memory contents in %0, so a non-zero
+   return means the swap took place. */
+static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
+{
+    asm ("cas [%2], %1, %0"
+         : "=&r" (newval)
+         : "r" (oldval), "r" (&(a->value)), "0" (newval)
+         : "memory");
+    return newval == oldval;
+}
+
+
+/* Pointer compare-and-swap using the "casx" instruction.
+
+   Operands: %0 is newval (used both as input and as output), %1 is oldval,
+   %2 is the address of a->value. Returns non-zero when the value the
+   instruction leaves in newval equals oldval, zero otherwise.
+
+   Presumably (per the Sparc v9 definition of casx - TODO confirm) the
+   instruction leaves the previous memory contents in %0, so a non-zero
+   return means the swap took place. */
+static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t *a, void* oldval,
+                                      void* newval)
+{
+    asm ("casx [%2], %1, %0         "
+         : "=&r" (newval)
+         : "r" (oldval), "r" (&(a->value)), "0" (newval)
+         : "memory");
+    return newval == oldval;
+}
index 756f1bd792de96f3c1f1de9f10b32f43669ac713..39537f3f4f9e9d4a494578177339c6d3a36e0fde 100644 (file)
@@ -60,10 +60,6 @@ typedef struct tMPI_Atomic_ptr
 tMPI_Atomic_ptr_t;
 
 
-
-#define TMPI_SPINLOCK_INITIALIZER   { 0 }
-
-
 /* for now we simply assume that int and void* assignments are atomic */
 #define tMPI_Atomic_get(a)  ((int)( (a)->value) )
 #define tMPI_Atomic_set(a, i)  (((a)->value) = (i))
index 982853ca426f3ee66d088f262da31e9205d67a3e..f3c3a174b0157e1129d4ca93fe72dceea2c628e9 100644 (file)
@@ -55,8 +55,6 @@ typedef struct tMPI_Atomic_ptr
 tMPI_Atomic_ptr_t;
 
 
-#define TMPI_SPINLOCK_INITIALIZER   { 0 }
-
 
 #define tMPI_Atomic_get(a)   ((a)->value)
 #define tMPI_Atomic_set(a, i)  (((a)->value) = (i))
@@ -67,7 +65,7 @@ tMPI_Atomic_ptr_t;
 
 
 #ifndef __INTEL_COMPILER
-#define TMPI_HAVE_SWAP
+#define TMPI_ATOMIC_HAVE_NATIVE_SWAP
 /* xchg operations: */
 /* ia64 xchg */
 static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
@@ -112,7 +110,6 @@ int _InterlockedCompareExchange(volatile int *dest, int xchg, int comp);
                                          void* comp);*/
 unsigned __int64 __fetchadd4_rel(unsigned int *addend, const int increment);
 /* ia64 memory barrier */
-/*#define tMPI_Atomic_memory_barrier() __memory_barrier()*/
 #define tMPI_Atomic_memory_barrier() __sync_synchronize()
 /* ia64 cmpxchg */
 #define tMPI_Atomic_cas(a, oldval, newval) \
@@ -127,9 +124,9 @@ unsigned __int64 __fetchadd4_rel(unsigned int *addend, const int increment);
 /* ia64 fetchadd, but it only works with increments +/- 1,4,8,16 */
 #define tMPI_ia64_fetchadd(a, inc)  __fetchadd4_rel(a, inc)
 
-#define TMPI_HAVE_SWAP
 #define tMPI_Atomic_swap(a, b) _InterlockedExchange( &((a)->value), (b))
 #define tMPI_Atomic_ptr_swap(a, b) _InterlockedExchangePointer( &((a)->value), (b))
+#define TMPI_ATOMIC_HAVE_NATIVE_SWAP
 
 #elif defined __GNUC__
 
@@ -213,6 +210,7 @@ static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
     }
     return (int)newval;
 }
+#define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
 
 
 
@@ -244,73 +242,7 @@ static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
     }
     return (int)oldval;
 }
-
-typedef struct tMPI_Spinlock
-{
-    volatile unsigned int   lock; /*!< Volatile, to avoid compiler aliasing */
-}
-tMPI_Spinlock_t;
-
-
-
-static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
-{
-    x->lock = 0;
-}
-
-
-static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
-{
-    tMPI_Atomic_t *a = (tMPI_Atomic_t *) x;
-    int            succeeded;
-    succeeded = tMPI_Atomic_cas(a, 0, 1);
-    if (!succeeded)
-    {
-        do
-        {
-            while (a->value != 0)
-            {
-                tMPI_Atomic_memory_barrier();
-            }
-            succeeded = tMPI_Atomic_cas(a, 0, 1);
-        }
-        while (!succeeded);
-    }
-}
-
-
-static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
-{
-    return (tMPI_Atomic_cas( ((tMPI_Atomic_t *)x), 0, 1));
-}
-
-
-static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
-{
-    do
-    {
-        tMPI_Atomic_memory_barrier();
-        x->lock = 0;
-    }
-    while (0);
-}
-
-
-static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
-{
-    return (x->lock != 0);
-}
-
-
-static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
-{
-
-    do
-    {
-        tMPI_Atomic_memory_barrier();
-    }
-    while (tMPI_Spinlock_islocked(x));
-}
+#define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
 
 #endif
 
index 4fe10a8dfc55f00c0c5df1759725f77d3e71aa40..f909cb91ec7026eb148e94acc47312eec73fae56 100644 (file)
 #define tMPI_Atomic_memory_barrier()  __sync_synchronize()
 
 
-static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, volatile int i)
-{
-    return __sync_add_and_fetch( &(a->value), i);
-}
-
-static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, volatile int i)
-{
-    return __sync_fetch_and_add( &(a->value), i);
-}
-
 
+TMPI_EXPORT
 static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
 {
     return __sync_bool_compare_and_swap( &(a->value), oldval, newval);
 }
 
-
-#if 0
-/* these definitions are only used if there's no assembly versions for them:
-   they're inefficient because they use compare-and-swap instead of just
-   swap. */
-static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
-{
-    int oldval;
-    do
-    {
-        oldval = a->value;
-    }
-    while (__sync_val_compare_and_swap( &(a->value), oldval, b) != oldval);
-
-    return oldval;
-}
-
-static inline void* tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
-{
-    void *oldval;
-    do
-    {
-        oldval = a->value;
-    }
-    while (__sync_val_compare_and_swap( &(a->value), oldval, b) != oldval);
-
-    return oldval;
-}
-#endif
-
-
-
+TMPI_EXPORT
 static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t* a, void *oldval,
                                       void *newval)
 {
@@ -103,3 +63,18 @@ static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t* a, void *oldval,
                                           (size_t)newval) );
 #endif
 }
+
+/* Atomically add i to the value in *a via the __sync_add_and_fetch
+   builtin and return its result. */
+TMPI_EXPORT
+static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, volatile int i)
+{
+    return __sync_add_and_fetch( &(a->value), i);
+}
+/* tells derived.h not to provide its CAS-based add_return fallback */
+#define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
+
+
+/* Atomically add i to the value in *a via the __sync_fetch_and_add
+   builtin and return its result. */
+TMPI_EXPORT
+static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, volatile int i)
+{
+    return __sync_fetch_and_add( &(a->value), i);
+}
+/* tells derived.h not to provide its CAS-based fetch_add fallback */
+#define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
index 9123412a2546035d8aa3600a57b93abc38332fa6..928d770e5d55fb8ceae2fdb0a18148f41af523f7 100644 (file)
@@ -66,12 +66,11 @@ typedef struct tMPI_Spinlock
     volatile unsigned int lock;   /*!< Volatile, to avoid compiler aliasing */
 }
 tMPI_Spinlock_t;
+#define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
 
 
 #define TMPI_SPINLOCK_INITIALIZER   { 0 }
 
-#define TMPI_HAVE_SWAP
-
 #define tMPI_Atomic_get(a)        ((a)->value)
 #define tMPI_Atomic_set(a, i)     (((a)->value) = (i))
 
@@ -90,7 +89,7 @@ tMPI_Spinlock_t;
 
 
 
-#define TMPI_HAVE_ASM_SWAP
+#define TMPI_ATOMIC_HAVE_NATIVE_SWAP
 static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
 {
     int ret;
@@ -183,6 +182,7 @@ static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t *a, void *oldval,
     return prev == oldval;
 }
 
+#define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
 static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
 {
     int t;
@@ -200,6 +200,7 @@ static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
 
 
 
+#define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
 static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
 {
     int t;
index f7dbc67d7ad91fb440094d28fa5fe292aa8659bc..84a244d9004bbe700c37257c19c1279dadeb1430 100644 (file)
 
 typedef struct tMPI_Spinlock
 {
-    volatile unsigned int  lock /*__attribute__ ((aligned(64)))*/;
+    volatile unsigned int lock /*__attribute__ ((aligned(64)))*/;
 } tMPI_Spinlock_t;
 
-
-
-
 #define TMPI_SPINLOCK_INITIALIZER   { 0 }
 
+#define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
+
 
 
 static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
@@ -83,18 +82,18 @@ static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
 }
 
 
-static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *  x)
+static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
 {
     __sync_lock_release(&(x->lock));
 }
 
-static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *  x)
+static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
 {
     __sync_synchronize();
     return ( x->lock == 1 );
 }
 
-static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *   x)
+static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
 {
     do
     {
index e6252a8893ce84c4b6c66222fa36fc469b9a3c2c..2fd496f6ebf8b306ecf8f65182d353f6be8ee4ef 100644 (file)
@@ -79,6 +79,8 @@ typedef struct tMPI_Spinlock
 
 #define TMPI_SPINLOCK_INITIALIZER   { 0 }
 
+#define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
+
 
 
 /* these are guaranteed to be  atomic on x86 and x86_64 */
@@ -110,23 +112,35 @@ typedef struct tMPI_Spinlock
 
 #define tMPI_Atomic_memory_barrier() __asm__ __volatile__("sfence;" : : : "memory")
 
-static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
+#define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
+static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
 {
-    int __i;
-
-    __i = i;
-    __asm__ __volatile__("lock ; xaddl %0, %1;"
-                         : "=r" (i) : "m" (a->value), "0" (i) : "memory");
-    return i + __i;
+    volatile int res = i;
+    /* volatile because we read and write back to the same variable in the
+       asm section.  some compilers requires this to be volatile */
+    __asm__ __volatile__("lock ; xaddl %0, %1;"      /* swap-add */
+                         : "=r" (res)                /* with register as
+                                                        output*/
+                         : "m" (a->value), "0" (res) /* and memory as input */
+                         : "memory");
+    return res;
 }
 
-static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
+#define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
+static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
 {
+    int          orig = i;
+    volatile int res  = i;
+
     __asm__ __volatile__("lock ; xaddl %0, %1;"
-                         : "=r" (i) : "m" (a->value), "0" (i) : "memory");
-    return i;
+                         : "=r" (res)
+                         : "m" (a->value), "0" (res)
+                         :  "memory");
+    return res + orig; /* then add again from the right value */
 }
 
+
+
 static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
 {
     int prev;
@@ -160,7 +174,8 @@ static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t *a,
 
 #endif /* end of check for gcc intrinsics */
 
-#define TMPI_HAVE_SWAP
+
+#define TMPI_ATOMIC_HAVE_NATIVE_SWAP
 /* do the swap fns; we told the intrinsics that we have them. */
 static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
 {
@@ -176,11 +191,6 @@ static inline void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
 {
     void *volatile *ret = (void* volatile*)b;
 #ifndef __x86_64__
-/*    __asm__ __volatile__("\txchgl %0, %1;"
-                         :"=m"(a->value),"=q"(b)
-                         :"q"(b)
-                         :"memory");
- */
     __asm__ __volatile__("\txchgl %0, %1;"
                          : "+r" (ret), "+m" (a->value)
                          :
@@ -235,7 +245,7 @@ static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
 
 
 
-static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *  x)
+static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
 {
     /* this is apparently all that is needed for unlocking a lock */
     __asm__ __volatile__(
index 8ab8d2498f9c46ecb81f8322c06089d20c918500..50bd7dc71bd95dd20dd0f072bfb5e61450e6dfbc 100644 (file)
@@ -36,8 +36,6 @@
  */
 
 
-/* Microsoft Visual C on x86, define taken from FFTW who got it from Morten Nissov */
-
 /* we need this for all the data types. We use WIN32_LEAN_AND_MEAN to avoid
       polluting the global namespace. */
 #define WIN32_LEAN_AND_MEAN
 
 typedef struct tMPI_Atomic
 {
-    LONG volatile      value;     /*!< Volatile, to avoid compiler aliasing */
+    LONG volatile value;          /*!< Volatile, to avoid compiler aliasing */
 } tMPI_Atomic_t;
 
 typedef struct tMPI_Atomic_ptr
 {
-    void* volatile      value;     /*!< Volatile, to avoid compiler aliasing */
+    void* volatile value;          /*!< Volatile, to avoid compiler aliasing */
 } tMPI_Atomic_ptr_t;
 
 typedef struct tMPI_Spinlock
 {
-    LONG volatile      lock;      /*!< Volatile, to avoid compiler aliasing */
+    LONG volatile lock;           /*!< Volatile, to avoid compiler aliasing */
 } tMPI_Spinlock_t;
 
-#define TMPI_SPINLOCK_INITIALIZER   { 0 }
-
+#define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
 
-#define TMPI_HAVE_SWAP
+#define TMPI_SPINLOCK_INITIALIZER   { 0 }
 
 
 #define tMPI_Atomic_get(a)  ((a)->value)
@@ -83,9 +80,11 @@ typedef struct tMPI_Spinlock
 
 #define tMPI_Atomic_fetch_add(a, i)  \
     InterlockedExchangeAdd((LONG volatile *)(a), (LONG) (i))
+#define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
 
 #define tMPI_Atomic_add_return(a, i)  \
     ( (i) + InterlockedExchangeAdd((LONG volatile *)(a), (LONG) (i)) )
+#define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
 
 #define tMPI_Atomic_cas(a, oldval, newval) \
     (InterlockedCompareExchange((LONG volatile *)(a), (LONG) (newval), (LONG) (oldval)) == (LONG)oldval)
@@ -94,6 +93,7 @@ typedef struct tMPI_Spinlock
     (InterlockedCompareExchangePointer(&((a)->value), (PVOID) (newval),  \
                                        (PVOID) (oldval)) == (PVOID)oldval)
 
+#define TMPI_ATOMIC_HAVE_NATIVE_SWAP
 #define tMPI_Atomic_swap(a, b) \
     InterlockedExchange((LONG volatile *)(a), (LONG) (b))
 
@@ -102,7 +102,7 @@ typedef struct tMPI_Spinlock
 
 
 
-static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *   x)
+static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
 {
     x->lock = 0;
 }
@@ -115,19 +115,19 @@ static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *   x)
     InterlockedCompareExchange((LONG volatile *)(x), 1, 0)
 
 
-static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *   x)
+static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
 {
     x->lock = 0;
 }
 
 
-static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *   x)
+static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
 {
     return (*(volatile signed char *)(&(x)->lock) != 0);
 }
 
 
-static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *   x)
+static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
 {
     while (tMPI_Spinlock_islocked(x))
     {
index 52c39081cdac265de2500009cc2f59a6f1ebd3d3..3f8024a702325e5c27be59aa2c11179f4714320f 100644 (file)
@@ -64,10 +64,6 @@ typedef struct tMPI_Atomic_ptr
 tMPI_Atomic_ptr_t;
 
 
-
-#define TMPI_SPINLOCK_INITIALIZER   { 0 }
-
-
 /* for now we simply assume that int and void* assignments are atomic */
 #define tMPI_Atomic_get(a)  ((int)( (a)->value) )
 #define tMPI_Atomic_set(a, i)  (((a)->value) = (i))
@@ -76,16 +72,6 @@ tMPI_Atomic_ptr_t;
 #define tMPI_Atomic_ptr_get(a)  ((void*)((a)->value) )
 #define tMPI_Atomic_ptr_set(a, i)  (((a)->value) = (void*)(i))
 
-static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, volatile int i)
-{
-    return (int) atomic_add_int_nv(&a->value, i);
-}
-
-static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, volatile int i)
-{
-    return (int) atomic_add_int_nv(&a->value, i) - i;
-}
-
 
 static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
 {
@@ -100,64 +86,14 @@ static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t* a, void *oldval,
     return atomic_cas_ptr(&(a->value), oldval, newval) == oldval;
 }
 
-
-
-typedef struct tMPI_Spinlock
-{
-    volatile unsigned long  lock;
-} tMPI_Spinlock_t;
-
-#define TMPI_SPINLOCK_INITIALIZER   { 0 }
-
-static inline unsigned long tas(volatile unsigned long *ptr)
-{
-    unsigned long result;
-    __asm__ __volatile__("          \
-            ldstub [%1], %0         "
-                         : "=r" (result)
-                         : "r" (ptr)
-                         : "memory");
-    return result;
-}
-
-
-static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
-{
-    x->lock = 0;
-}
-
-
-static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
-{
-    do
-    {
-    }
-    while (tas(&(x->lock)) == 1);
-}
-
-
-static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
-{
-    return tas(&(x->lock));
-}
-
-
-static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *  x)
-{
-    x->lock = 0;
-}
-
-static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *  x)
+static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, volatile int i)
 {
-    tMPI_Atomic_memory_barrier();
-    return ( x->lock == 1 );
+    return (int) atomic_add_int_nv(&a->value, i);
 }
+#define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
 
-static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *   x)
+static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, volatile int i)
 {
-    do
-    {
-    }
-    while (x->lock == 1);
-    tMPI_Atomic_memory_barrier();
+    return (int) atomic_add_int_nv(&a->value, i) - i;
 }
+#define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
index b08474a8f67acadc9110e55fde1f7a51ed53f1c1..0dd1f6753a6da2b1f00b1f93e785f95be3ac6e85 100644 (file)
@@ -93,6 +93,7 @@ typedef struct tMPI_Spinlock
     volatile int lock __attribute__ ((aligned(64)));
 }
 tMPI_Spinlock_t;
+#define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
 
 
 
@@ -189,6 +190,7 @@ static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
     return t;
 #endif
 }
+#define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
 
 
 
@@ -226,6 +228,7 @@ static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
     return (t - i);
 #endif
 }
+#define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
 
 
 static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
index 2c238774978648c1306704aa5d03a5266a285c85..058e3a8d9b604f1e1d13abb03d8e71ec009eabcc 100644 (file)
@@ -103,7 +103,7 @@ void tMPI_Event_signal(tMPI_Event *ev);
 #define tMPI_Event_signal(ev) \
     { \
         tMPI_Atomic_memory_barrier_rel(); \
-        tMPI_Atomic_add_return( &((ev)->sync), 1); \
+        tMPI_Atomic_fetch_add( &((ev)->sync), 1); \
     }
 #endif
 
index 4e026824cacc110372c6ff5dc94af93bfacefeaf..f30e8bf740e594b1f879e7f96479d67dbcb0df6c 100644 (file)
@@ -100,7 +100,7 @@ int tMPI_Lock_trylock(tMPI_Lock_t *lock);
  *  \param lock  Pointer to previously created lock.
  */
 TMPI_EXPORT
-int tMPI_Lock_islocked(const tMPI_Lock_t *lock);
+int tMPI_Lock_islocked(tMPI_Lock_t *lock);
 
 
 
index d15ee357e044cd9caceabb9adb97ded31c418047..8d8d5fd4850387d94fc23b44b61476d53e8c848e 100644 (file)
@@ -103,7 +103,8 @@ typedef struct tMPI_Thread* tMPI_Thread_t;
  */
 typedef struct
 {
-    tMPI_Atomic_t      initialized; /*!< Whether \a mutex has been initialized. */
+    tMPI_Atomic_t      initialized; /*!< Whether \a mutex has been
+                                       initialized. */
     struct tMPI_Mutex* mutex;       /*!< Actual mutex data structure. */
 }  tMPI_Thread_mutex_t;
 /*! \brief Static initializer for tMPI_Thread_mutex_t
@@ -125,7 +126,8 @@ typedef struct
  */
 typedef struct
 {
-    tMPI_Atomic_t           initialized; /*!< Whether \a key has been initialized. */
+    tMPI_Atomic_t           initialized; /*!< Whether \a key has been
+                                            initialized. */
     struct tMPI_Thread_key *key;         /*!< Actual key data structure. */
 } tMPI_Thread_key_t;
 
@@ -178,8 +180,10 @@ typedef struct
  */
 typedef struct
 {
-    tMPI_Atomic_t            initialized; /*!< Whether \a condp has been initialized. */
-    struct tMPI_Thread_cond* condp;       /*!< Actual condition variable data structure. */
+    tMPI_Atomic_t            initialized; /*!< Whether \a condp has been
+                                             initialized. */
+    struct tMPI_Thread_cond* condp;       /*!< Actual condition variable data
+                                             structure. */
 } tMPI_Thread_cond_t;
 /*! \brief Static initializer for tMPI_Thread_cond_t
  *
@@ -451,7 +455,7 @@ int tMPI_Thread_mutex_lock(tMPI_Thread_mutex_t *mtx);
  *  return code (usually meaning the mutex was already locked).
  *
  *  \param mtx  Pointer to the mutex to try and lock
- *  \return 0 or a non-zero return error code.
+ *  \return 0 if locked, non-zero if not locked or an error occurred.
  */
 TMPI_EXPORT
 int tMPI_Thread_mutex_trylock(tMPI_Thread_mutex_t *mtx);
index 5ce8f9c0e14d3cc060e62b85e04534bdbfa0f577..d5f0e4bf4d09ac82c4015159ac9a7c1efd33d63e 100644 (file)
@@ -174,33 +174,36 @@ extern const tMPI_Datatype TMPI_POINTER;            /**< pointer (thread_mpi
 /** Error codes */
 enum
 {
-    TMPI_SUCCESS = 0,               /*!< No error */
-    TMPI_ERR_MALLOC,                /*!< Out of memory */
-    TMPI_ERR_INIT,                  /*!< Initialization error */
-    TMPI_ERR_FINALIZE,              /*!< Finalize error */
-    TMPI_ERR_GROUP,                 /*!< Group error */
-    TMPI_ERR_COMM,                  /*!< Comm error */
-    TMPI_ERR_STATUS,                /*!< Status error */
-    TMPI_ERR_GROUP_RANK,            /*!< Group rank error */
-    TMPI_ERR_DIMS,
-    TMPI_ERR_COORDS,
-    TMPI_ERR_CART_CREATE_NPROCS,
-    TMPI_ERR_XFER_COUNTERPART,
-    TMPI_ERR_XFER_BUFSIZE,
-    TMPI_ERR_XFER_BUF_OVERLAP,
-    TMPI_ERR_SEND_DEST,
-    TMPI_ERR_RECV_SRC,
-    TMPI_ERR_BUF,
-    TMPI_ERR_MULTI_MISMATCH,
-    TMPI_ERR_OP_FN,
-    TMPI_ERR_ENVELOPES,
-    TMPI_ERR_REQUESTS,
-    TMPI_ERR_IN_STATUS,
-    TMPI_ERR_PROCNR,                /*!< Hardware processor number (such as for
-                                         thread affinity) error */
-    TMPI_FAILURE,
-    TMPI_ERR_UNKNOWN,
-    N_TMPI_ERR  /* this must be the last one */
+    TMPI_SUCCESS = 0,            /*!< No error */
+    TMPI_ERR_NO_MEM,             /*!< Out of memory */
+    TMPI_ERR_IO,                 /*!< I/O Error (used for system errors) */
+    TMPI_ERR_INIT,               /*!< Initialization error */
+    TMPI_ERR_FINALIZE,           /*!< Finalize error */
+    TMPI_ERR_GROUP,              /*!< Group error */
+    TMPI_ERR_COMM,               /*!< Comm error */
+    TMPI_ERR_STATUS,             /*!< Status error */
+    TMPI_ERR_GROUP_RANK,         /*!< Group rank error */
+    TMPI_ERR_DIMS,               /*!< Invalid topology dimensions */
+    TMPI_ERR_COORDS,             /*!< Invalid topology coordinates */
+    TMPI_ERR_CART_CREATE_NPROCS, /*!< Not enough processes for topology*/
+    TMPI_ERR_XFER_COUNTERPART,   /*!< Invalid counterpart for xfer */
+    TMPI_ERR_XFER_BUFSIZE,       /*!< buffer size too small*/
+    TMPI_ERR_XFER_BUF_OVERLAP,   /*!< buffer overlaps (thread error?)*/
+    TMPI_ERR_SEND_DEST,          /*!< Faulty send destination */
+    TMPI_ERR_RECV_SRC,           /*!< Faulty receive source */
+    TMPI_ERR_BUF,                /*!< Invalid buffer */
+    TMPI_ERR_MULTI_MISMATCH,     /*!< Comm not the same in collective call*/
+    TMPI_ERR_OP_FN,              /*!< Invalid reduce operator*/
+    TMPI_ERR_ENVELOPES,          /*!< out of envelopes (tMPI internal) */
+    TMPI_ERR_REQUESTS,           /*!< out of requests (tMPI internal) */
+    TMPI_ERR_COPY_NBUFFERS,      /*!< out of copy buffers (tMPI internal)*/
+    TMPI_ERR_COPY_BUFFER_SIZE,   /*!< copy buffer size err (tMPI internal)*/
+    TMPI_ERR_IN_STATUS,          /*!< error code in tMPI_Status */
+    TMPI_ERR_PROCNR,             /*!< Hardware processor number (such as for
+                                      thread affinity) error */
+    TMPI_FAILURE,                /*!< Transmission failure */
+    TMPI_ERR_UNKNOWN,            /*!< Unknown error */
+    N_TMPI_ERR                   /* this must be the last one */
 };
 
 /** Maximum length of error string for tMPI_Error_string() */
index 84866e8b8be962804f2ab99ccb8b845e9e32e605..e5fb5c73dab3ba24bae76e35288f80da3b389b80 100644 (file)
@@ -134,7 +134,7 @@ enum {
     F_VTEMP_NOLONGERUSED,
     F_PDISPCORR,
     F_PRES,
-    F_DHDL_CON,
+    F_DVDL_CONSTR,
     F_DVDL,
     F_DKDL,
     F_DVDL_COUL,
index 40a1c5d22e81a9ae67cdf1bfd20edb2b2ac0deea..278c8de8f388246eed1a626ea9393b764ceee60a 100644 (file)
@@ -851,6 +851,8 @@ void sum_dhdl(gmx_enerdata_t *enerd, real *lambda, t_lambda *fepvals)
      * which is a very good approximation (except for exotic settings).
      * (investigate how to overcome this post 4.6 - MRS)
      */
+    enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR];
+    enerd->term[F_DVDL_CONSTR] = 0;
 
     for (i = 0; i < fepvals->n_lambda; i++)
     {                                         /* note we are iterating over fepvals here!
index 359ec5f8e538ea6b209a5dea348705fb6828e7b7..cd99ec2a0f7f0859337fc889201bacba55536208 100644 (file)
@@ -316,7 +316,7 @@ void init_em(FILE *fplog, const char *title,
              gmx_mdoutf_t **outf, t_mdebin **mdebin)
 {
     int  start, homenr, i;
-    real dvdlambda;
+    real dvdl_constr;
 
     if (fplog)
     {
@@ -432,11 +432,11 @@ void init_em(FILE *fplog, const char *title,
         if (!ir->bContinuation)
         {
             /* Constrain the starting coordinates */
-            dvdlambda = 0;
+            dvdl_constr = 0;
             constrain(PAR(cr) ? NULL : fplog, TRUE, TRUE, constr, &(*top)->idef,
                       ir, NULL, cr, -1, 0, mdatoms,
                       ems->s.x, ems->s.x, NULL, fr->bMolPBC, ems->s.box,
-                      ems->s.lambda[efptFEP], &dvdlambda,
+                      ems->s.lambda[efptFEP], &dvdl_constr,
                       NULL, NULL, nrnb, econqCoord, FALSE, 0, 0);
         }
     }
@@ -551,7 +551,7 @@ static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md,
     int      i;
     int      start, end;
     rvec    *x1, *x2;
-    real     dvdlambda;
+    real     dvdl_constr;
 
     s1 = &ems1->s;
     s2 = &ems2->s;
@@ -649,11 +649,11 @@ static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md,
     if (constr)
     {
         wallcycle_start(wcycle, ewcCONSTR);
-        dvdlambda = 0;
+        dvdl_constr = 0;
         constrain(NULL, TRUE, TRUE, constr, &top->idef,
                   ir, NULL, cr, count, 0, md,
                   s1->x, s2->x, NULL, bMolPBC, s2->box,
-                  s2->lambda[efptBONDED], &dvdlambda,
+                  s2->lambda[efptBONDED], &dvdl_constr,
                   NULL, NULL, nrnb, econqCoord, FALSE, 0, 0);
         wallcycle_stop(wcycle, ewcCONSTR);
     }
@@ -694,7 +694,7 @@ static void evaluate_energy(FILE *fplog, gmx_bool bVerbose, t_commrec *cr,
     gmx_bool bNS;
     int      nabnsb;
     tensor   force_vir, shake_vir, ekin;
-    real     dvdlambda, prescorr, enercorr, dvdlcorr;
+    real     dvdl_constr, prescorr, enercorr, dvdlcorr;
     real     terminate = 0;
 
     /* Set the time to the initial time, the time does not change during EM */
@@ -789,17 +789,17 @@ static void evaluate_energy(FILE *fplog, gmx_bool bVerbose, t_commrec *cr,
     {
         /* Project out the constraint components of the force */
         wallcycle_start(wcycle, ewcCONSTR);
-        dvdlambda = 0;
+        dvdl_constr = 0;
         constrain(NULL, FALSE, FALSE, constr, &top->idef,
                   inputrec, NULL, cr, count, 0, mdatoms,
                   ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box,
-                  ems->s.lambda[efptBONDED], &dvdlambda,
+                  ems->s.lambda[efptBONDED], &dvdl_constr,
                   NULL, &shake_vir, nrnb, econqForceDispl, FALSE, 0, 0);
         if (fr->bSepDVDL && fplog)
         {
-            fprintf(fplog, sepdvdlformat, "Constraints", t, dvdlambda);
+            fprintf(fplog, sepdvdlformat, "Constraints", t, dvdl_constr);
         }
-        enerd->term[F_DVDL_BONDED] += dvdlambda;
+        enerd->term[F_DVDL_CONSTR] += dvdl_constr;
         m_add(force_vir, shake_vir, vir);
         wallcycle_stop(wcycle, ewcCONSTR);
     }
@@ -2374,7 +2374,7 @@ double do_steep(FILE *fplog, t_commrec *cr,
     gmx_global_stat_t gstat;
     t_graph          *graph;
     real              stepsize, constepsize;
-    real              ustep, dvdlambda, fnormn;
+    real              ustep, fnormn;
     gmx_mdoutf_t     *outf;
     t_mdebin         *mdebin;
     gmx_bool          bDone, bAbort, do_x, do_f;
index 81d9a2ba615439397c765819602062f70eaa062f..f972179326c42d49df7b8cfdd37d9836b5d9ce23 100644 (file)
@@ -198,7 +198,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     gmx_bool          bResetCountersHalfMaxH = FALSE;
     gmx_bool          bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
     gmx_bool          bUpdateDoLR;
-    real              mu_aver = 0, dvdl;
+    real              mu_aver = 0, dvdl_constr;
     int               a0, a1, gnx = 0, ii;
     atom_id          *grpindex = NULL;
     char             *grpname;
@@ -1259,9 +1259,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 bOK = TRUE;
                 if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
                 {
-                    dvdl = 0;
-
-                    update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms,
+                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
                                        state, fr->bMolPBC, graph, f,
                                        &top->idef, shake_vir, NULL,
                                        cr, nrnb, wcycle, upd, constr,
@@ -1356,7 +1354,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
             if (bTrotter && !bInitStep)
             {
-                enerd->term[F_DVDL_BONDED] += dvdl;        /* only add after iterations */
                 copy_mat(shake_vir, state->svir_prev);
                 copy_mat(force_vir, state->fvir_prev);
                 if (IR_NVT_TROTTER(ir) && ir->eI == eiVV)
@@ -1371,12 +1368,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             {
                 copy_rvecn(cbuf, state->v, 0, state->natoms);
             }
-
-            if (fr->bSepDVDL && fplog && do_log)
-            {
-                fprintf(fplog, sepdvdlformat, "Constraint", 0.0, dvdl);
-            }
-            enerd->term[F_DVDL_BONDED] += dvdl;
         }
 
         /* MRS -- now done iterating -- compute the conserved quantity */
@@ -1641,7 +1632,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
                 if (constr && bIfRandomize)
                 {
-                    update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms,
+                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
                                        state, fr->bMolPBC, graph, f,
                                        &top->idef, tmp_vir, NULL,
                                        cr, nrnb, wcycle, upd, constr,
@@ -1678,10 +1669,11 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             copy_mat(state->box, lastbox);
 
             bOK = TRUE;
+            dvdl_constr = 0;
+
             if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
             {
                 wallcycle_start(wcycle, ewcUPDATE);
-                dvdl = 0;
                 /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
                 if (bTrotter)
                 {
@@ -1741,7 +1733,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                               ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
                 wallcycle_stop(wcycle, ewcUPDATE);
 
-                update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms, state,
+                update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state,
                                    fr->bMolPBC, graph, f,
                                    &top->idef, shake_vir, force_vir,
                                    cr, nrnb, wcycle, upd, constr,
@@ -1774,7 +1766,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                      * to numerical errors, or are they important
                      * physically? I'm thinking they are just errors, but not completely sure.
                      * For now, will call without actually constraining, constr=NULL*/
-                    update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms,
+                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
                                        state, fr->bMolPBC, graph, f,
                                        &top->idef, tmp_vir, force_vir,
                                        cr, nrnb, wcycle, upd, NULL,
@@ -1788,9 +1780,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
                 if (fr->bSepDVDL && fplog && do_log)
                 {
-                    fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl);
+                    fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl_constr);
                 }
-                enerd->term[F_DVDL_BONDED] += dvdl;
+                enerd->term[F_DVDL_CONSTR] += dvdl_constr;
             }
             else if (graph)
             {
@@ -1868,7 +1860,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         }
 
         /* only add constraint dvdl after constraints */
-        enerd->term[F_DVDL_BONDED] += dvdl;
+        enerd->term[F_DVDL_CONSTR] += dvdl_constr;
         if (!bVV || bRerunMD)
         {
             /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */